From 6b869838f107d4c92bd80c0cab3543131f406a73 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 3 Aug 2019 09:58:49 -0400 Subject: [PATCH] literal: remove teddy The Teddy algorithm has been moved to the aho-corasick crate (as of 0.7.5), so we can now use it there. Note that we do explicitly use aho-corasick's `packed` module instead of relying on `AhoCorasick`'s prefilter to do it for us. The reasoning, unfortunately, is that using Teddy inside of `AhoCorasick` has some measurable overhead that we'd like to avoid. It would be better to figure out how to remove that overhead, but I was unsuccessful. It isn't much additional work to reach around and use the packed search directly. Benchmarks roughly stay the same, with the `regexdna::variant` benchmarks improving across the board by about 1.5x-2x. Some benchmarks do regress (e.g., `sherlock::the_nocase`), but we decide to live with it for now. The regression is likely due to subtle changes in Teddy's bucket allocation. --- Cargo.toml | 2 +- src/lib.rs | 1 - src/{literal/mod.rs => literal.rs} | 92 +--- src/literal/teddy_avx2/fallback.rs | 32 -- src/literal/teddy_avx2/imp.rs | 478 ----------------- src/literal/teddy_avx2/mod.rs | 8 - src/literal/teddy_ssse3/fallback.rs | 32 -- src/literal/teddy_ssse3/imp.rs | 785 ---------------------------- src/literal/teddy_ssse3/mod.rs | 8 - src/vector/avx2.rs | 183 ------- src/vector/mod.rs | 4 - src/vector/ssse3.rs | 186 ------- 12 files changed, 29 insertions(+), 1782 deletions(-) rename src/{literal/mod.rs => literal.rs} (92%) delete mode 100644 src/literal/teddy_avx2/fallback.rs delete mode 100644 src/literal/teddy_avx2/imp.rs delete mode 100644 src/literal/teddy_avx2/mod.rs delete mode 100644 src/literal/teddy_ssse3/fallback.rs delete mode 100644 src/literal/teddy_ssse3/imp.rs delete mode 100644 src/literal/teddy_ssse3/mod.rs delete mode 100644 src/vector/avx2.rs delete mode 100644 src/vector/mod.rs delete mode 100644 src/vector/ssse3.rs diff --git a/Cargo.toml b/Cargo.toml index a58f257ee0..c92e36f4be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ members = [ [dependencies] # For very fast prefix literal matching. -aho-corasick = "0.7.4" +aho-corasick = "0.7.6" # For skipping along search text quickly when a leading byte is known. memchr = "2.2.1" # For managing regex caches quickly across multiple threads. diff --git a/src/lib.rs b/src/lib.rs index daaa9885da..b46179f095 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -659,7 +659,6 @@ mod re_trait; mod re_unicode; mod sparse; mod utf8; -mod vector; /// The `internal` module exists to support suspicious activity, such as /// testing different matching engines and supporting the `regex-debug` CLI diff --git a/src/literal/mod.rs b/src/literal.rs similarity index 92% rename from src/literal/mod.rs rename to src/literal.rs index 64123b6441..ae405cbf4a 100644 --- a/src/literal/mod.rs +++ b/src/literal.rs @@ -1,17 +1,12 @@ use std::cmp; use std::mem; -use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder}; +use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; use memchr::{memchr, memchr2, memchr3}; use syntax::hir::literal::{Literal, Literals}; -use self::teddy_avx2::Teddy as TeddyAVX2; -use self::teddy_ssse3::Teddy as TeddySSSE3; use freqs::BYTE_FREQUENCIES; -mod teddy_avx2; -mod teddy_ssse3; - /// A prefix extracted from a compiled regular expression.
/// /// A regex prefix is a set of literal strings that *must* be matched at the @@ -37,12 +32,13 @@ enum Matcher { BoyerMoore(BoyerMooreSearch), /// An Aho-Corasick automaton. AC { ac: AhoCorasick, lits: Vec }, - /// A simd accelerated multiple string matcher. Used only for a small - /// number of small literals. - TeddySSSE3(TeddySSSE3), - /// A simd accelerated multiple string matcher. Used only for a small - /// number of small literals. This uses 256-bit vectors. - TeddyAVX2(TeddyAVX2), + /// A packed multiple substring searcher, using SIMD. + /// + /// Note that Aho-Corasick will actually use this packed searcher + /// internally automatically, however, there is some overhead associated + /// with going through the Aho-Corasick machinery. So using the packed + /// searcher directly results in some gains. + Packed { s: packed::Searcher, lits: Vec }, } impl LiteralSearcher { @@ -95,8 +91,9 @@ impl LiteralSearcher { AC { ref ac, .. } => { ac.find(haystack).map(|m| (m.start(), m.end())) } - TeddySSSE3(ref t) => t.find(haystack).map(|m| (m.start, m.end)), - TeddyAVX2(ref t) => t.find(haystack).map(|m| (m.start, m.end)), + Packed { ref s, .. } => { + s.find(haystack).map(|m| (m.start(), m.end())) + } } } @@ -134,12 +131,7 @@ impl LiteralSearcher { Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat), Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern), Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), - Matcher::TeddySSSE3(ref ted) => { - LiteralIter::TeddySSSE3(ted.patterns()) - } - Matcher::TeddyAVX2(ref ted) => { - LiteralIter::TeddyAVX2(ted.patterns()) - } + Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), } } @@ -167,8 +159,7 @@ impl LiteralSearcher { FreqyPacked(_) => 1, BoyerMoore(_) => 1, AC { ref ac, .. } => ac.pattern_count(), - TeddySSSE3(ref ted) => ted.len(), - TeddyAVX2(ref ted) => ted.len(), + Packed { ref lits, .. } => lits.len(), } } @@ -181,8 +172,7 @@ impl LiteralSearcher { FreqyPacked(ref single) => single.approximate_size(), BoyerMoore(ref single) => single.approximate_size(), AC { ref ac, .. } => ac.heap_bytes(), - TeddySSSE3(ref ted) => ted.approximate_size(), - TeddyAVX2(ref ted) => ted.approximate_size(), + Packed { ref s, .. } => s.heap_bytes(), } } } @@ -222,34 +212,17 @@ impl Matcher { return Matcher::FreqyPacked(FreqyPacked::new(lit)); } } - let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii; - if TeddyAVX2::available() && !is_aho_corasick_fast { - const MAX_TEDDY_LITERALS: usize = 32; - if lits.literals().len() <= MAX_TEDDY_LITERALS { - if let Some(ted) = TeddyAVX2::new(lits) { - return Matcher::TeddyAVX2(ted); - } - } - } - if TeddySSSE3::available() && !is_aho_corasick_fast { - // Only try Teddy if Aho-Corasick can't use memchr on an ASCII - // byte. Also, in its current form, Teddy doesn't scale well to - // lots of literals. - // - // We impose the ASCII restriction since an alternation of - // non-ASCII string literals in the same language is likely to all - // start with the same byte. Even worse, the corpus being searched - // probably has a similar composition, which ends up completely - // negating the benefit of memchr. 
- const MAX_TEDDY_LITERALS: usize = 32; - if lits.literals().len() <= MAX_TEDDY_LITERALS { - if let Some(ted) = TeddySSSE3::new(lits) { - return Matcher::TeddySSSE3(ted); - } + + let pats = lits.literals().to_owned(); + let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; + if lits.literals().len() <= 100 && !is_aho_corasick_fast { + let mut builder = packed::Config::new() + .match_kind(packed::MatchKind::LeftmostFirst) + .builder(); + if let Some(s) = builder.extend(&pats).build() { + return Matcher::Packed { s, lits: pats }; } - // Fallthrough to ol' reliable Aho-Corasick... } - let pats = lits.literals().to_owned(); let ac = AhoCorasickBuilder::new() .match_kind(aho_corasick::MatchKind::LeftmostFirst) .dfa(true) @@ -264,8 +237,7 @@ pub enum LiteralIter<'a> { Bytes(&'a [u8]), Single(&'a [u8]), AC(&'a [Literal]), - TeddySSSE3(&'a [Vec]), - TeddyAVX2(&'a [Vec]), + Packed(&'a [Literal]), } impl<'a> Iterator for LiteralIter<'a> { @@ -301,16 +273,7 @@ impl<'a> Iterator for LiteralIter<'a> { Some(&**next) } } - LiteralIter::TeddySSSE3(ref mut lits) => { - if lits.is_empty() { - None - } else { - let next = &lits[0]; - *lits = &lits[1..]; - Some(&**next) - } - } - LiteralIter::TeddyAVX2(ref mut lits) => { + LiteralIter::Packed(ref mut lits) => { if lits.is_empty() { None } else { @@ -809,8 +772,9 @@ impl BoyerMooreSearch { if window_end - window_end_snapshot > 16 * mem::size_of::() { - // Returning a window_end >= backstop will immediatly - // break us out of the inner loop in `find`. + // Returning a window_end >= backstop will + // immediatly break us out of the inner loop in + // `find`. if window_end >= backstop { return Some(window_end); } diff --git a/src/literal/teddy_avx2/fallback.rs b/src/literal/teddy_avx2/fallback.rs deleted file mode 100644 index 953895abc5..0000000000 --- a/src/literal/teddy_avx2/fallback.rs +++ /dev/null @@ -1,32 +0,0 @@ -use syntax::hir::literal::Literals; - -#[derive(Debug, Clone)] -pub struct Teddy(()); - -#[derive(Debug, Clone)] -pub struct Match { - pub pat: usize, - pub start: usize, - pub end: usize, -} - -impl Teddy { - pub fn available() -> bool { - false - } - pub fn new(_pats: &Literals) -> Option { - None - } - pub fn patterns(&self) -> &[Vec] { - &[] - } - pub fn len(&self) -> usize { - 0 - } - pub fn approximate_size(&self) -> usize { - 0 - } - pub fn find(&self, _haystack: &[u8]) -> Option { - None - } -} diff --git a/src/literal/teddy_avx2/imp.rs b/src/literal/teddy_avx2/imp.rs deleted file mode 100644 index 1ce3adfa52..0000000000 --- a/src/literal/teddy_avx2/imp.rs +++ /dev/null @@ -1,478 +0,0 @@ -/*! -This is the Teddy searcher, but ported to AVX2. - -See the module comments in the SSSE3 Teddy searcher for a more in depth -explanation of how this algorithm works. For the most part, this port is -basically the same as the SSSE3 version, but using 256-bit vectors instead of -128-bit vectors, which increases throughput. -*/ - -use std::cmp; - -use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder}; -use syntax::hir::literal::Literals; - -use vector::avx2::{u8x32, AVX2VectorBuilder}; - -/// Corresponds to the number of bytes read at a time in the haystack. -const BLOCK_SIZE: usize = 32; - -/// Match reports match information. -#[derive(Debug, Clone)] -pub struct Match { - /// The index of the pattern that matched. The index is in correspondence - /// with the order of the patterns given at construction. - pub pat: usize, - /// The start byte offset of the match. - pub start: usize, - /// The end byte offset of the match. 
This is always `start + pat.len()`. - pub end: usize, -} - -/// A SIMD accelerated multi substring searcher. -#[derive(Debug, Clone)] -pub struct Teddy { - /// A builder for AVX2 empowered vectors. - vb: AVX2VectorBuilder, - /// A list of substrings to match. - pats: Vec>, - /// An Aho-Corasick automaton of the patterns. We use this when we need to - /// search pieces smaller than the Teddy block size. - ac: AhoCorasick, - /// A set of 8 buckets. Each bucket corresponds to a single member of a - /// bitset. A bucket contains zero or more substrings. This is useful - /// when the number of substrings exceeds 8, since our bitsets cannot have - /// more than 8 members. - buckets: Vec>, - /// Our set of masks. There's one mask for each byte in the fingerprint. - masks: Masks, -} - -impl Teddy { - /// Returns true if and only if Teddy is supported on this platform. - /// - /// If this returns `false`, then `Teddy::new(...)` is guaranteed to - /// return `None`. - pub fn available() -> bool { - AVX2VectorBuilder::new().is_some() - } - - /// Create a new `Teddy` multi substring matcher. - /// - /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has - /// an empty substring), then `None` is returned. - pub fn new(pats: &Literals) -> Option { - let vb = match AVX2VectorBuilder::new() { - None => return None, - Some(vb) => vb, - }; - if !Teddy::available() { - return None; - } - - let pats: Vec<_> = - pats.literals().iter().map(|p| p.to_vec()).collect(); - let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0); - // Don't allow any empty patterns and require that we have at - // least one pattern. - if min_len < 1 { - return None; - } - // Pick the largest mask possible, but no larger than 3. - let nmasks = cmp::min(3, min_len); - let mut masks = Masks::new(vb, nmasks); - let mut buckets = vec![vec![]; 8]; - // Assign a substring to each bucket, and add the bucket's bitfield to - // the appropriate position in the mask. - for (pati, pat) in pats.iter().enumerate() { - let bucket = pati % 8; - buckets[bucket].push(pati); - masks.add(bucket as u8, pat); - } - let ac = AhoCorasickBuilder::new() - .match_kind(aho_corasick::MatchKind::LeftmostFirst) - .dfa(true) - .prefilter(false) - .build(&pats); - Some(Teddy { - vb: vb, - pats: pats.to_vec(), - ac: ac, - buckets: buckets, - masks: masks, - }) - } - - /// Returns all of the substrings matched by this `Teddy`. - pub fn patterns(&self) -> &[Vec] { - &self.pats - } - - /// Returns the number of substrings in this matcher. - pub fn len(&self) -> usize { - self.pats.len() - } - - /// Returns the approximate size on the heap used by this matcher. - pub fn approximate_size(&self) -> usize { - self.pats.iter().fold(0, |a, b| a + b.len()) - } - - /// Searches `haystack` for the substrings in this `Teddy`. If a match was - /// found, then it is returned. Otherwise, `None` is returned. - pub fn find(&self, haystack: &[u8]) -> Option { - // This is safe because the only way we can construct a Teddy type - // is if AVX2 is available. - unsafe { self.find_impl(haystack) } - } - - #[allow(unused_attributes)] - #[target_feature(enable = "avx2")] - unsafe fn find_impl(&self, haystack: &[u8]) -> Option { - // If our haystack is smaller than the block size, then fall back to - // a naive brute force search. 
- if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) { - return self.slow(haystack, 0); - } - match self.masks.len() { - 0 => None, - 1 => self.find1(haystack), - 2 => self.find2(haystack), - 3 => self.find3(haystack), - _ => unreachable!(), - } - } - - /// `find1` is used when there is only 1 mask. This is the easy case and is - /// pretty much as described in the module documentation. - #[inline(always)] - fn find1(&self, haystack: &[u8]) -> Option { - let mut pos = 0; - let zero = self.vb.u8x32_splat(0); - let len = haystack.len(); - debug_assert!(len >= BLOCK_SIZE); - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-32. - let p = haystack.get_unchecked(pos..); - self.vb.u8x32_load_unchecked_unaligned(p) - }; - // N.B. `res0` is our `C` in the module documentation. - let res0 = self.masks.members1(h); - // Only do expensive verification if there are any non-zero bits. - let bitfield = res0.ne(zero).movemask(); - if bitfield != 0 { - if let Some(m) = self.verify(haystack, pos, res0, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - self.slow(haystack, pos) - } - - /// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes - /// long. - #[inline(always)] - fn find2(&self, haystack: &[u8]) -> Option { - // This is an exotic way to right shift a SIMD vector across lanes. - // See below at use for more details. - let zero = self.vb.u8x32_splat(0); - let len = haystack.len(); - // The previous value of `C` (from the module documentation) for the - // *first* byte in the fingerprint. On subsequent iterations, we take - // the last bitset from the previous `C` and insert it into the first - // position of the current `C`, shifting all other bitsets to the right - // one lane. This causes `C` for the first byte to line up with `C` for - // the second byte, so that they can be `AND`'d together. - let mut prev0 = self.vb.u8x32_splat(0xFF); - let mut pos = 1; - debug_assert!(len >= BLOCK_SIZE); - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-32. - let p = haystack.get_unchecked(pos..); - self.vb.u8x32_load_unchecked_unaligned(p) - }; - let (res0, res1) = self.masks.members2(h); - - // Do this: - // - // (prev0 << 15) | (res0 >> 1) - // - // This lets us line up our C values for each byte. - let res0prev0 = res0.alignr_15(prev0); - - // `AND`'s our `C` values together. - let res = res0prev0.and(res1); - prev0 = res0; - - let bitfield = res.ne(zero).movemask(); - if bitfield != 0 { - let pos = pos.checked_sub(1).unwrap(); - if let Some(m) = self.verify(haystack, pos, res, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - // The windowing above doesn't check the last byte in the last - // window, so start the slow search at the last byte of the last - // window. - self.slow(haystack, pos.checked_sub(1).unwrap()) - } - - /// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes - /// long. - /// - /// N.B. This is a straight-forward extrapolation of `find2`. The only - /// difference is that we need to keep track of two previous values of `C`, - /// since we now need to align for three bytes. 
- #[inline(always)] - fn find3(&self, haystack: &[u8]) -> Option { - let zero = self.vb.u8x32_splat(0); - let len = haystack.len(); - let mut prev0 = self.vb.u8x32_splat(0xFF); - let mut prev1 = self.vb.u8x32_splat(0xFF); - let mut pos = 2; - - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-32. - let p = haystack.get_unchecked(pos..); - self.vb.u8x32_load_unchecked_unaligned(p) - }; - let (res0, res1, res2) = self.masks.members3(h); - - let res0prev0 = res0.alignr_14(prev0); - let res1prev1 = res1.alignr_15(prev1); - let res = res0prev0.and(res1prev1).and(res2); - - prev0 = res0; - prev1 = res1; - - let bitfield = res.ne(zero).movemask(); - if bitfield != 0 { - let pos = pos.checked_sub(2).unwrap(); - if let Some(m) = self.verify(haystack, pos, res, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - // The windowing above doesn't check the last two bytes in the last - // window, so start the slow search at the penultimate byte of the - // last window. - // self.slow(haystack, pos.saturating_sub(2)) - self.slow(haystack, pos.checked_sub(2).unwrap()) - } - - /// Runs the verification procedure on `res` (i.e., `C` from the module - /// documentation), where the haystack block starts at `pos` in - /// `haystack`. `bitfield` has ones in the bit positions that `res` has - /// non-zero bytes. - /// - /// If a match exists, it returns the first one. - #[inline(always)] - fn verify( - &self, - haystack: &[u8], - pos: usize, - res: u8x32, - mut bitfield: u32, - ) -> Option { - let patterns = res.bytes(); - while bitfield != 0 { - // The next offset, relative to pos, where some fingerprint - // matched. - let byte_pos = bitfield.trailing_zeros() as usize; - bitfield &= !(1 << byte_pos); - - // Offset relative to the beginning of the haystack. - let start = pos + byte_pos; - - // The bitfield telling us which patterns had fingerprints that - // match at this starting position. - let mut patterns = patterns[byte_pos]; - while patterns != 0 { - let bucket = patterns.trailing_zeros() as usize; - patterns &= !(1 << bucket); - - // Actual substring search verification. - if let Some(m) = self.verify_bucket(haystack, bucket, start) { - return Some(m); - } - } - } - - None - } - - /// Verifies whether any substring in the given bucket matches in haystack - /// at the given starting position. - #[inline(always)] - fn verify_bucket( - &self, - haystack: &[u8], - bucket: usize, - start: usize, - ) -> Option { - // This cycles through the patterns in the bucket in the order that - // the patterns were given. Therefore, we guarantee leftmost-first - // semantics. - for &pati in &self.buckets[bucket] { - let pat = &*self.pats[pati]; - if start + pat.len() > haystack.len() { - continue; - } - if pat == &haystack[start..start + pat.len()] { - return Some(Match { - pat: pati, - start: start, - end: start + pat.len(), - }); - } - } - None - } - - /// Slow substring search through all patterns in this matcher. - /// - /// This is used when we don't have enough bytes in the haystack for our - /// block based approach. - #[inline(never)] - fn slow(&self, haystack: &[u8], pos: usize) -> Option { - self.ac.find(&haystack[pos..]).map(|m| Match { - pat: m.pattern(), - start: pos + m.start(), - end: pos + m.end(), - }) - } -} - -/// A list of masks. This has length equal to the length of the fingerprint. 
-/// The length of the fingerprint is always `min(3, len(smallest_substring))`. -#[derive(Debug, Clone)] -struct Masks { - vb: AVX2VectorBuilder, - masks: [Mask; 3], - size: usize, -} - -impl Masks { - /// Create a new set of masks of size `n`, where `n` corresponds to the - /// number of bytes in a fingerprint. - fn new(vb: AVX2VectorBuilder, n: usize) -> Masks { - Masks { - vb: vb, - masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)], - size: n, - } - } - - /// Returns the number of masks. - fn len(&self) -> usize { - self.size - } - - /// Adds the given pattern to the given bucket. The bucket should be a - /// power of `2 <= 2^7`. - fn add(&mut self, bucket: u8, pat: &[u8]) { - for i in 0..self.len() { - self.masks[i].add(bucket, pat[i]); - } - } - - /// Finds the fingerprints that are in the given haystack block. i.e., this - /// returns `C` as described in the module documentation. - /// - /// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and - /// only if `haystack_block[i]` corresponds to a fingerprint that is part - /// of a pattern in bucket `j`. - #[inline(always)] - fn members1(&self, haystack_block: u8x32) -> u8x32 { - let masklo = self.vb.u8x32_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)) - } - - /// Like members1, but computes C for the first and second bytes in the - /// fingerprint. - #[inline(always)] - fn members2(&self, haystack_block: u8x32) -> (u8x32, u8x32) { - let masklo = self.vb.u8x32_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - let res0 = - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); - let res1 = - self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); - (res0, res1) - } - - /// Like `members1`, but computes `C` for the first, second and third bytes - /// in the fingerprint. - #[inline(always)] - fn members3(&self, haystack_block: u8x32) -> (u8x32, u8x32, u8x32) { - let masklo = self.vb.u8x32_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - let res0 = - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); - let res1 = - self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); - let res2 = - self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi)); - (res0, res1, res2) - } -} - -/// A single mask. -#[derive(Debug, Clone, Copy)] -struct Mask { - /// Bitsets for the low nybbles in a fingerprint. - lo: u8x32, - /// Bitsets for the high nybbles in a fingerprint. - hi: u8x32, -} - -impl Mask { - /// Create a new mask with no members. - fn new(vb: AVX2VectorBuilder) -> Mask { - Mask { lo: vb.u8x32_splat(0), hi: vb.u8x32_splat(0) } - } - - /// Adds the given byte to the given bucket. - fn add(&mut self, bucket: u8, byte: u8) { - // Split our byte into two nybbles, and add each nybble to our - // mask. 
- let byte_lo = (byte & 0xF) as usize; - let byte_hi = (byte >> 4) as usize; - - { - let mut lo_bytes = self.lo.bytes(); - let lo = lo_bytes[byte_lo] | ((1 << bucket) as u8); - lo_bytes[byte_lo] = lo; - lo_bytes[byte_lo + 16] = lo; - self.lo.replace_bytes(lo_bytes); - } - - { - let mut hi_bytes = self.hi.bytes(); - let hi = hi_bytes[byte_hi] | ((1 << bucket) as u8); - hi_bytes[byte_hi] = hi; - hi_bytes[byte_hi + 16] = hi; - self.hi.replace_bytes(hi_bytes); - } - } -} diff --git a/src/literal/teddy_avx2/mod.rs b/src/literal/teddy_avx2/mod.rs deleted file mode 100644 index 6930ed5028..0000000000 --- a/src/literal/teddy_avx2/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub use self::imp::*; - -#[cfg(target_arch = "x86_64")] -mod imp; - -#[cfg(not(target_arch = "x86_64"))] -#[path = "fallback.rs"] -mod imp; diff --git a/src/literal/teddy_ssse3/fallback.rs b/src/literal/teddy_ssse3/fallback.rs deleted file mode 100644 index 953895abc5..0000000000 --- a/src/literal/teddy_ssse3/fallback.rs +++ /dev/null @@ -1,32 +0,0 @@ -use syntax::hir::literal::Literals; - -#[derive(Debug, Clone)] -pub struct Teddy(()); - -#[derive(Debug, Clone)] -pub struct Match { - pub pat: usize, - pub start: usize, - pub end: usize, -} - -impl Teddy { - pub fn available() -> bool { - false - } - pub fn new(_pats: &Literals) -> Option { - None - } - pub fn patterns(&self) -> &[Vec] { - &[] - } - pub fn len(&self) -> usize { - 0 - } - pub fn approximate_size(&self) -> usize { - 0 - } - pub fn find(&self, _haystack: &[u8]) -> Option { - None - } -} diff --git a/src/literal/teddy_ssse3/imp.rs b/src/literal/teddy_ssse3/imp.rs deleted file mode 100644 index ac7ed11e9d..0000000000 --- a/src/literal/teddy_ssse3/imp.rs +++ /dev/null @@ -1,785 +0,0 @@ -/*! -Teddy is a simd accelerated multiple substring matching algorithm. The name -and the core ideas in the algorithm were learned from the [Hyperscan][1_u] -project. - - -Background ----------- - -The key idea of Teddy is to do *packed* substring matching. In the literature, -packed substring matching is the idea of examining multiple bytes in a haystack -at a time to detect matches. Implementations of, for example, memchr (which -detects matches of a single byte) have been doing this for years. Only -recently, with the introduction of various SIMD instructions, has this been -extended to substring matching. The PCMPESTRI instruction (and its relatives), -for example, implements substring matching in hardware. It is, however, limited -to substrings of length 16 bytes or fewer, but this restriction is fine in a -regex engine, since we rarely care about the performance difference between -searching for a 16 byte literal and a 16 + N literal; 16 is already long -enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs -at least, is its latency and throughput. As a result, it is often faster to do -substring search with a Boyer-Moore variant and a well placed memchr to quickly -skip through the haystack. - -There are fewer results from the literature on packed substring matching, -and even fewer for packed multiple substring matching. Ben-Kiki et al. [2] -describes use of PCMPESTRI for substring matching, but is mostly theoretical -and hand-waves performance. There is other theoretical work done by Bille [3] -as well. - -The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci -and is generally focused on multiple pattern search. 
Their first paper [4a] -introduces the concept of a fingerprint, which is computed for every block of -N bytes in every pattern. The haystack is then scanned N bytes at a time and -a fingerprint is computed in the same way it was computed for blocks in the -patterns. If the fingerprint corresponds to one that was found in a pattern, -then a verification step follows to confirm that one of the substrings with the -corresponding fingerprint actually matches at the current location. Various -implementation tricks are employed to make sure the fingerprint lookup is fast; -typically by truncating the fingerprint. (This may, of course, provoke more -steps in the verification process, so a balance must be struck.) - -The main downside of [4a] is that the minimum substring length is 32 bytes, -presumably because of how the algorithm uses certain SIMD instructions. This -essentially makes it useless for general purpose regex matching, where a small -number of short patterns is far more likely. - -Faro and Kulekci published another paper [4b] that is conceptually very similar -to [4a]. The key difference is that it uses the CRC32 instruction (introduced -as part of SSE 4.2) to compute fingerprint values. This also enables the -algorithm to work effectively on substrings as short as 7 bytes with 4 byte -windows. 7 bytes is unfortunately still too long. The window could be -technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the -small window size ends up negating most performance benefits—and it's likely -the common case in a general purpose regex engine. - -Faro and Kulekci also published [4c] that appears to be intended as a -replacement to using PCMPESTRI. In particular, it is specifically motivated by -the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD -instructions that are faster. While this approach works for short substrings, -I personally couldn't see a way to generalize it to multiple substring search. - -Faro and Kulekci have another paper [4d] that I haven't been able to read -because it is behind a paywall. - - -Teddy ------ - -Finally, we get to Teddy. If the above literature review is complete, then it -appears that Teddy is a novel algorithm. More than that, in my experience, it -completely blows away the competition for short substrings, which is exactly -what we want in a general purpose regex engine. Again, the algorithm appears -to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced -late 2015, and no earlier history could be found. Therefore, tracking the exact -provenance of the algorithm with respect to the published literature seems -difficult. - -DISCLAIMER: My understanding of Teddy is limited to reading auto-generated C -code, its disassembly and observing its runtime behavior. - -At a high level, Teddy works somewhat similarly to the fingerprint algorithms -published by Faro and Kulekci, but Teddy does it in a way that scales a bit -better. Namely: - -1. Teddy's core algorithm scans the haystack in 16 byte chunks. 16 is - significant because it corresponds to the number of bytes in a SIMD vector. - If one used AVX2 instructions, then we could scan the haystack in 32 byte - chunks. Similarly, if one used AVX512 instructions, we could scan the - haystack in 64 byte chunks. Hyperscan implements SSE + AVX2, we only - implement SSE for the moment. -2. Bitwise operations are performed on each chunk to discover if any region of - it matches a set of precomputed fingerprints from the patterns. 
If there are - matches, then a verification step is performed. In this implementation, our - verification step is naive. This can be improved upon. - -The details to make this work are quite clever. First, we must choose how to -pick our fingerprints. In Hyperscan's implementation, I *believe* they use the -last N bytes of each substring, where N must be at least the minimum length of -any substring in the set being searched. In this implementation, we use the -first N bytes of each substring. (The tradeoffs between these choices aren't -yet clear to me.) We then must figure out how to quickly test whether an -occurrence of any fingerprint from the set of patterns appears in a 16 byte -block from the haystack. To keep things simple, let's assume N = 1 and examine -some examples to motivate the approach. Here are our patterns: - -```ignore -foo -bar -baz -``` - -The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set -our 16 byte block to: - -```ignore -bat cat foo bump -xxxxxxxxxxxxxxxx -``` - -To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates -a mask that allows us to quickly compute membership of a fingerprint in a 16 -byte block that also tells which pattern the fingerprint corresponds to. In -this case, our fingerprint is a single byte, so an appropriate abstraction is -a map from a single byte to a list of patterns that contain that fingerprint: - -```ignore -f |--> foo -b |--> bar, baz -``` - -Now, all we need to do is figure out how to represent this map in vector space -and use normal SIMD operations to perform a lookup. The first simplification -we can make is to represent our patterns as bit fields occupying a single -byte. This is important, because a single SIMD vector can store 16 bytes. - -```ignore -f |--> 00000001 -b |--> 00000010, 00000100 -``` - -How do we perform lookup though? It turns out that SSSE3 introduced a very cool -instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, -and returns a third vector `C`. All vectors are treated as 16 8-bit integers. -`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true -for the purposes of this algorithm. For full details, see [Intel's Intrinsics -Guide][5_u].) This essentially lets us use the values in `B` to lookup values -in `A`. - -If we could somehow cause `B` to contain our 16 byte block from the haystack, -and if `A` could contain our bitmasks, then we'd end up with something like -this for `A`: - -```ignore - 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF -A = 0 0 00000110 00000001 0 -``` - -And if `B` contains our window from our haystack, we could use shuffle to take -the values from `B` and use them to look up our bitsets in `A`. But of course, -we can't do this because `A` in the above example contains 256 bytes, which -is much larger than the size of a SIMD vector. - -Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of -our bitsets, we can use two masks, where one mask corresponds to the lower four -bits of our fingerprint and the other mask corresponds to the upper four bits. -So our map now looks like: - -```ignore -'f' & 0xF = 0x6 |--> 00000001 -'f' >> 4 = 0x6 |--> 00000111 -'b' & 0xF = 0x2 |--> 00000110 -'b' >> 4 = 0x6 |--> 00000111 -``` - -Notice that the bitsets for each nybble correspond to the union of all -fingerprints that contain that nybble. For example, both `f` and `b` have the -same upper 4 bits but differ on the lower 4 bits. 
Putting this together, we -have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is -our mask for the upper nybble and `B` is our 16 byte block from the haystack: - -```ignore - 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF -A0 = 0 0 00000110 0 00000001 0 -A1 = 0 0 0 0 00000111 0 -B = b a t _ t p -B = 0x62 0x61 0x74 0x20 0x74 0x70 -``` - -But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits, -and we need indexes that are at most 4 bits (corresponding to one of 16 -values). We can apply the same transformation to split `B` into lower and upper -nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and -`B1` corresponds to the upper nybbles: - -```ignore - b a t _ c a t _ f o o _ b u m p -B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0 -B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7 -``` - -And now we have a nice correspondence. `B0` can index `A0` and `B1` can index -`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`: - -```ignore - b a ... f o ... p - A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0] -C0 = 00000110 0 00000001 0 0 -``` - -And `C1 = PSHUFB(A1, B1)`: - -```ignore - b a ... f o ... p - A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7] -C1 = 00000111 00000111 00000111 00000111 0 -``` - -Notice how neither one of `C0` or `C1` is guaranteed to report fully correct -results all on its own. For example, `C1` claims that `b` is a fingerprint for -the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint -for all of our patterns. But if we combined `C0` and `C1` with an `AND` -operation: - -```ignore - b a ... f o ... p -C = 00000110 0 00000001 0 0 -``` - -Then we now have that `C[i]` contains a bitset corresponding to the matching -fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that -block. - -Once we have that, we can look for the position of the least significant bit -in `C`. That position, modulo `8`, gives us the pattern that the fingerprint -matches. That position, integer divided by `8`, also gives us the byte offset -that the fingerprint occurs in inside the 16 byte haystack block. Using those -two pieces of information, we can run a verification procedure that tries -to match all substrings containing that fingerprint at that position in the -haystack. - - -Implementation notes --------------------- - -The problem with the algorithm as described above is that it uses a single byte -for a fingerprint. This will work well if the fingerprints are rare in the -haystack (e.g., capital letters or special characters in normal English text), -but if the fingerprints are common, you'll wind up spending too much time in -the verification step, which effectively negate the performance benefits of -scanning 16 bytes at a time. Remember, the key to the performance of this -algorithm is to do as little work as possible per 16 bytes. - -This algorithm can be extrapolated in a relatively straight-forward way to use -larger fingerprints. That is, instead of a single byte prefix, we might use a -three byte prefix. The implementation below implements N = {1, 2, 3} and always -picks the largest N possible. The rationale is that the bigger the fingerprint, -the fewer verification steps we'll do. Of course, if N is too large, then we'll -end up doing too much on each step. - -The way to extend it is: - -1. Add a mask for each byte in the fingerprint. (Remember that each mask is - composed of two SIMD vectors.) 
This results in a value of `C` for each byte - in the fingerprint while searching. -2. When testing each 16 byte block, each value of `C` must be shifted so that - they are aligned. Once aligned, they should all be `AND`'d together. This - will give you only the bitsets corresponding to the full match of the - fingerprint. - -The implementation below is commented to fill in the nitty gritty details. - -References ----------- - -- **[1]** [Hyperscan on GitHub](https://github.com/01org/hyperscan), - [webpage](https://01.org/hyperscan) -- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R., - & Weimann, O. (2011). - _Optimal packed string matching_. - In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13). - Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik. - DOI: 10.4230/LIPIcs.FSTTCS.2011.423. - [PDF](http://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf). -- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R., - & Weimann, O. (2014). - _Towards optimal packed string matching_. - Theoretical Computer Science, 525, 111-129. - DOI: 10.1016/j.tcs.2013.06.013. - [PDF](http://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf). -- **[3]** Bille, P. (2011). - _Fast searching in packed strings_. - Journal of Discrete Algorithms, 9(1), 49-56. - DOI: 10.1016/j.jda.2010.09.003. - [PDF](http://www.sciencedirect.com/science/article/pii/S1570866710000353). -- **[4a]** Faro, S., & Külekci, M. O. (2012, October). - _Fast multiple string matching using streaming SIMD extensions technology_. - In String Processing and Information Retrieval (pp. 217-228). - Springer Berlin Heidelberg. - DOI: 10.1007/978-3-642-34109-0_23. - [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro32.pdf). -- **[4b]** Faro, S., & Külekci, M. O. (2013, September). - _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_. - In Stringology (pp. 78-91). - [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro36.pdf). -- **[4c]** Faro, S., & Külekci, M. O. (2013, January). - _Fast packed string matching for short patterns_. - In Proceedings of the Meeting on Algorithm Engineering & Expermiments - (pp. 113-121). - Society for Industrial and Applied Mathematics. - [PDF](http://arxiv.org/pdf/1209.6449.pdf). -- **[4d]** Faro, S., & Külekci, M. O. (2014). - _Fast and flexible packed string matching_. - Journal of Discrete Algorithms, 28, 61-72. - DOI: 10.1016/j.jda.2014.07.003. - -[1_u]: https://github.com/01org/hyperscan -[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide -*/ - -use std::cmp; - -use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder}; -use syntax::hir::literal::Literals; - -use vector::ssse3::{u8x16, SSSE3VectorBuilder}; - -/// Corresponds to the number of bytes read at a time in the haystack. -const BLOCK_SIZE: usize = 16; - -/// Match reports match information. -#[derive(Debug, Clone)] -pub struct Match { - /// The index of the pattern that matched. The index is in correspondence - /// with the order of the patterns given at construction. - pub pat: usize, - /// The start byte offset of the match. - pub start: usize, - /// The end byte offset of the match. This is always `start + pat.len()`. - pub end: usize, -} - -/// A SIMD accelerated multi substring searcher. -#[derive(Debug, Clone)] -pub struct Teddy { - /// A builder for SSSE3 empowered vectors. - vb: SSSE3VectorBuilder, - /// A list of substrings to match. - pats: Vec>, - /// An Aho-Corasick automaton of the patterns. 
We use this when we need to - /// search pieces smaller than the Teddy block size. - ac: AhoCorasick, - /// A set of 8 buckets. Each bucket corresponds to a single member of a - /// bitset. A bucket contains zero or more substrings. This is useful - /// when the number of substrings exceeds 8, since our bitsets cannot have - /// more than 8 members. - buckets: Vec>, - /// Our set of masks. There's one mask for each byte in the fingerprint. - masks: Masks, -} - -impl Teddy { - /// Returns true if and only if Teddy is supported on this platform. - /// - /// If this returns `false`, then `Teddy::new(...)` is guaranteed to - /// return `None`. - pub fn available() -> bool { - SSSE3VectorBuilder::new().is_some() - } - - /// Create a new `Teddy` multi substring matcher. - /// - /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has - /// an empty substring), then `None` is returned. - pub fn new(pats: &Literals) -> Option { - let vb = match SSSE3VectorBuilder::new() { - None => return None, - Some(vb) => vb, - }; - if !Teddy::available() { - return None; - } - - let pats: Vec<_> = - pats.literals().iter().map(|p| p.to_vec()).collect(); - let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0); - // Don't allow any empty patterns and require that we have at - // least one pattern. - if min_len < 1 { - return None; - } - // Pick the largest mask possible, but no larger than 3. - let nmasks = cmp::min(3, min_len); - let mut masks = Masks::new(vb, nmasks); - let mut buckets = vec![vec![]; 8]; - // Assign a substring to each bucket, and add the bucket's bitfield to - // the appropriate position in the mask. - for (pati, pat) in pats.iter().enumerate() { - let bucket = pati % 8; - buckets[bucket].push(pati); - masks.add(bucket as u8, pat); - } - let ac = AhoCorasickBuilder::new() - .match_kind(aho_corasick::MatchKind::LeftmostFirst) - .dfa(true) - .prefilter(false) - .build(&pats); - Some(Teddy { - vb: vb, - pats: pats.to_vec(), - ac: ac, - buckets: buckets, - masks: masks, - }) - } - - /// Returns all of the substrings matched by this `Teddy`. - pub fn patterns(&self) -> &[Vec] { - &self.pats - } - - /// Returns the number of substrings in this matcher. - pub fn len(&self) -> usize { - self.pats.len() - } - - /// Returns the approximate size on the heap used by this matcher. - pub fn approximate_size(&self) -> usize { - self.pats.iter().fold(0, |a, b| a + b.len()) - } - - /// Searches `haystack` for the substrings in this `Teddy`. If a match was - /// found, then it is returned. Otherwise, `None` is returned. - pub fn find(&self, haystack: &[u8]) -> Option { - // This is safe because the only way we can construct a Teddy type - // is if SSSE3 is available. - unsafe { self.find_impl(haystack) } - } - - #[allow(unused_attributes)] - #[target_feature(enable = "ssse3")] - unsafe fn find_impl(&self, haystack: &[u8]) -> Option { - // If our haystack is smaller than the block size, then fall back to - // a naive brute force search. - if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) { - return self.slow(haystack, 0); - } - match self.masks.len() { - 0 => None, - 1 => self.find1(haystack), - 2 => self.find2(haystack), - 3 => self.find3(haystack), - _ => unreachable!(), - } - } - - /// `find1` is used when there is only 1 mask. This is the easy case and is - /// pretty much as described in the module documentation. 
- #[inline(always)] - fn find1(&self, haystack: &[u8]) -> Option { - let mut pos = 0; - let zero = self.vb.u8x16_splat(0); - let len = haystack.len(); - debug_assert!(len >= BLOCK_SIZE); - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-16. - let p = haystack.get_unchecked(pos..); - self.vb.u8x16_load_unchecked_unaligned(p) - }; - // N.B. `res0` is our `C` in the module documentation. - let res0 = self.masks.members1(h); - // Only do expensive verification if there are any non-zero bits. - let bitfield = res0.ne(zero).movemask(); - if bitfield != 0 { - if let Some(m) = self.verify(haystack, pos, res0, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - self.slow(haystack, pos) - } - - /// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes - /// long. - #[inline(always)] - fn find2(&self, haystack: &[u8]) -> Option { - // This is an exotic way to right shift a SIMD vector across lanes. - // See below at use for more details. - let zero = self.vb.u8x16_splat(0); - let len = haystack.len(); - // The previous value of `C` (from the module documentation) for the - // *first* byte in the fingerprint. On subsequent iterations, we take - // the last bitset from the previous `C` and insert it into the first - // position of the current `C`, shifting all other bitsets to the right - // one lane. This causes `C` for the first byte to line up with `C` for - // the second byte, so that they can be `AND`'d together. - let mut prev0 = self.vb.u8x16_splat(0xFF); - let mut pos = 1; - debug_assert!(len >= BLOCK_SIZE); - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-16. - let p = haystack.get_unchecked(pos..); - self.vb.u8x16_load_unchecked_unaligned(p) - }; - let (res0, res1) = self.masks.members2(h); - - // Do this: - // - // (prev0 << 15) | (res0 >> 1) - // - // This lets us line up our C values for each byte. - let res0prev0 = res0.alignr_15(prev0); - - // `AND`'s our `C` values together. - let res = res0prev0.and(res1); - prev0 = res0; - - let bitfield = res.ne(zero).movemask(); - if bitfield != 0 { - let pos = pos.checked_sub(1).unwrap(); - if let Some(m) = self.verify(haystack, pos, res, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - // The windowing above doesn't check the last byte in the last - // window, so start the slow search at the last byte of the last - // window. - self.slow(haystack, pos.checked_sub(1).unwrap()) - } - - /// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes - /// long. - /// - /// N.B. This is a straight-forward extrapolation of `find2`. The only - /// difference is that we need to keep track of two previous values of `C`, - /// since we now need to align for three bytes. - #[inline(always)] - fn find3(&self, haystack: &[u8]) -> Option { - let zero = self.vb.u8x16_splat(0); - let len = haystack.len(); - let mut prev0 = self.vb.u8x16_splat(0xFF); - let mut prev1 = self.vb.u8x16_splat(0xFF); - let mut pos = 2; - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-16. 
- let p = haystack.get_unchecked(pos..); - self.vb.u8x16_load_unchecked_unaligned(p) - }; - let (res0, res1, res2) = self.masks.members3(h); - - let res0prev0 = res0.alignr_14(prev0); - let res1prev1 = res1.alignr_15(prev1); - let res = res0prev0.and(res1prev1).and(res2); - - prev0 = res0; - prev1 = res1; - - let bitfield = res.ne(zero).movemask(); - if bitfield != 0 { - let pos = pos.checked_sub(2).unwrap(); - if let Some(m) = self.verify(haystack, pos, res, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - // The windowing above doesn't check the last two bytes in the last - // window, so start the slow search at the penultimate byte of the - // last window. - // self.slow(haystack, pos.saturating_sub(2)) - self.slow(haystack, pos.checked_sub(2).unwrap()) - } - - /// Runs the verification procedure on `res` (i.e., `C` from the module - /// documentation), where the haystack block starts at `pos` in - /// `haystack`. `bitfield` has ones in the bit positions that `res` has - /// non-zero bytes. - /// - /// If a match exists, it returns the first one. - #[inline(always)] - fn verify( - &self, - haystack: &[u8], - pos: usize, - res: u8x16, - mut bitfield: u32, - ) -> Option { - let patterns = res.bytes(); - while bitfield != 0 { - // The next offset, relative to pos, where some fingerprint - // matched. - let byte_pos = bitfield.trailing_zeros() as usize; - bitfield &= !(1 << byte_pos); - - // Offset relative to the beginning of the haystack. - let start = pos + byte_pos; - - // The bitfield telling us which patterns had fingerprints that - // match at this starting position. - let mut patterns = patterns[byte_pos]; - while patterns != 0 { - let bucket = patterns.trailing_zeros() as usize; - patterns &= !(1 << bucket); - - // Actual substring search verification. - if let Some(m) = self.verify_bucket(haystack, bucket, start) { - return Some(m); - } - } - } - - None - } - - /// Verifies whether any substring in the given bucket matches in haystack - /// at the given starting position. - #[inline(always)] - fn verify_bucket( - &self, - haystack: &[u8], - bucket: usize, - start: usize, - ) -> Option { - // This cycles through the patterns in the bucket in the order that - // the patterns were given. Therefore, we guarantee leftmost-first - // semantics. - for &pati in &self.buckets[bucket] { - let pat = &*self.pats[pati]; - if start + pat.len() > haystack.len() { - continue; - } - if pat == &haystack[start..start + pat.len()] { - return Some(Match { - pat: pati, - start: start, - end: start + pat.len(), - }); - } - } - None - } - - /// Slow substring search through all patterns in this matcher. - /// - /// This is used when we don't have enough bytes in the haystack for our - /// block based approach. - #[inline(never)] - fn slow(&self, haystack: &[u8], pos: usize) -> Option { - self.ac.find(&haystack[pos..]).map(|m| Match { - pat: m.pattern(), - start: pos + m.start(), - end: pos + m.end(), - }) - } -} - -/// A list of masks. This has length equal to the length of the fingerprint. -/// The length of the fingerprint is always `min(3, len(smallest_substring))`. -#[derive(Debug, Clone)] -struct Masks { - vb: SSSE3VectorBuilder, - masks: [Mask; 3], - size: usize, -} - -impl Masks { - /// Create a new set of masks of size `n`, where `n` corresponds to the - /// number of bytes in a fingerprint. - fn new(vb: SSSE3VectorBuilder, n: usize) -> Masks { - Masks { - vb: vb, - masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)], - size: n, - } - } - - /// Returns the number of masks. 
- fn len(&self) -> usize { - self.size - } - - /// Adds the given pattern to the given bucket. The bucket should be a - /// power of `2 <= 2^7`. - fn add(&mut self, bucket: u8, pat: &[u8]) { - for i in 0..self.len() { - self.masks[i].add(bucket, pat[i]); - } - } - - /// Finds the fingerprints that are in the given haystack block. i.e., this - /// returns `C` as described in the module documentation. - /// - /// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and - /// only if `haystack_block[i]` corresponds to a fingerprint that is part - /// of a pattern in bucket `j`. - #[inline(always)] - fn members1(&self, haystack_block: u8x16) -> u8x16 { - let masklo = self.vb.u8x16_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)) - } - - /// Like members1, but computes C for the first and second bytes in the - /// fingerprint. - #[inline(always)] - fn members2(&self, haystack_block: u8x16) -> (u8x16, u8x16) { - let masklo = self.vb.u8x16_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - let res0 = - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); - let res1 = - self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); - (res0, res1) - } - - /// Like `members1`, but computes `C` for the first, second and third bytes - /// in the fingerprint. - #[inline(always)] - fn members3(&self, haystack_block: u8x16) -> (u8x16, u8x16, u8x16) { - let masklo = self.vb.u8x16_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - let res0 = - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); - let res1 = - self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); - let res2 = - self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi)); - (res0, res1, res2) - } -} - -/// A single mask. -#[derive(Debug, Clone, Copy)] -struct Mask { - /// Bitsets for the low nybbles in a fingerprint. - lo: u8x16, - /// Bitsets for the high nybbles in a fingerprint. - hi: u8x16, -} - -impl Mask { - /// Create a new mask with no members. - fn new(vb: SSSE3VectorBuilder) -> Mask { - Mask { lo: vb.u8x16_splat(0), hi: vb.u8x16_splat(0) } - } - - /// Adds the given byte to the given bucket. - fn add(&mut self, bucket: u8, byte: u8) { - // Split our byte into two nybbles, and add each nybble to our - // mask. 
- let byte_lo = (byte & 0xF) as usize; - let byte_hi = (byte >> 4) as usize; - - { - let mut lo_bytes = self.lo.bytes(); - let lo = lo_bytes[byte_lo]; - lo_bytes[byte_lo] = ((1 << bucket) as u8) | lo; - self.lo.replace_bytes(lo_bytes); - } - { - let mut hi_bytes = self.hi.bytes(); - let hi = hi_bytes[byte_hi]; - hi_bytes[byte_hi] = ((1 << bucket) as u8) | hi; - self.hi.replace_bytes(hi_bytes); - } - } -} diff --git a/src/literal/teddy_ssse3/mod.rs b/src/literal/teddy_ssse3/mod.rs deleted file mode 100644 index 6930ed5028..0000000000 --- a/src/literal/teddy_ssse3/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub use self::imp::*; - -#[cfg(target_arch = "x86_64")] -mod imp; - -#[cfg(not(target_arch = "x86_64"))] -#[path = "fallback.rs"] -mod imp; diff --git a/src/vector/avx2.rs b/src/vector/avx2.rs deleted file mode 100644 index 5228c60d1c..0000000000 --- a/src/vector/avx2.rs +++ /dev/null @@ -1,183 +0,0 @@ -#![allow(dead_code)] - -use std::arch::x86_64::*; -use std::fmt; -use std::mem; - -#[derive(Clone, Copy, Debug)] -pub struct AVX2VectorBuilder(()); - -impl AVX2VectorBuilder { - pub fn new() -> Option { - if is_x86_feature_detected!("avx2") { - Some(AVX2VectorBuilder(())) - } else { - None - } - } - - /// Create a new u8x32 AVX2 vector where all of the bytes are set to - /// the given value. - #[inline] - pub fn u8x32_splat(self, n: u8) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { u8x32::splat(n) } - } - - /// Load 32 bytes from the given slice, with bounds checks. - #[inline] - pub fn u8x32_load_unaligned(self, slice: &[u8]) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { u8x32::load_unaligned(slice) } - } - - /// Load 32 bytes from the given slice, without bounds checks. - #[inline] - pub unsafe fn u8x32_load_unchecked_unaligned(self, slice: &[u8]) -> u8x32 { - // Safe because we know AVX2 is enabled, but still unsafe - // because we aren't doing bounds checks. - u8x32::load_unchecked_unaligned(slice) - } - - /// Load 32 bytes from the given slice, with bound and alignment checks. - #[inline] - pub fn u8x32_load(self, slice: &[u8]) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { u8x32::load(slice) } - } - - /// Load 32 bytes from the given slice, without bound or alignment checks. - #[inline] - pub unsafe fn u8x32_load_unchecked(self, slice: &[u8]) -> u8x32 { - // Safe because we know AVX2 is enabled, but still unsafe - // because we aren't doing bounds checks. - u8x32::load_unchecked(slice) - } -} - -#[derive(Clone, Copy)] -#[allow(non_camel_case_types)] -#[repr(transparent)] -pub struct u8x32 { - vector: __m256i, -} - -impl u8x32 { - #[inline] - unsafe fn splat(n: u8) -> u8x32 { - u8x32 { vector: _mm256_set1_epi8(n as i8) } - } - - #[inline] - unsafe fn load_unaligned(slice: &[u8]) -> u8x32 { - assert!(slice.len() >= 32); - u8x32::load_unchecked_unaligned(slice) - } - - #[inline] - unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x32 { - let p = slice.as_ptr() as *const u8 as *const __m256i; - u8x32 { vector: _mm256_loadu_si256(p) } - } - - #[inline] - unsafe fn load(slice: &[u8]) -> u8x32 { - assert!(slice.len() >= 32); - assert!(slice.as_ptr() as usize % 32 == 0); - u8x32::load_unchecked(slice) - } - - #[inline] - unsafe fn load_unchecked(slice: &[u8]) -> u8x32 { - let p = slice.as_ptr() as *const u8 as *const __m256i; - u8x32 { vector: _mm256_load_si256(p) } - } - - #[inline] - pub fn shuffle(self, indices: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. 
- unsafe { - u8x32 { vector: _mm256_shuffle_epi8(self.vector, indices.vector) } - } - } - - #[inline] - pub fn ne(self, other: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { - let boolv = _mm256_cmpeq_epi8(self.vector, other.vector); - let ones = _mm256_set1_epi8(0xFF as u8 as i8); - u8x32 { vector: _mm256_andnot_si256(boolv, ones) } - } - } - - #[inline] - pub fn and(self, other: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { - u8x32 { vector: _mm256_and_si256(self.vector, other.vector) } - } - } - - #[inline] - pub fn movemask(self) -> u32 { - // Safe because we know AVX2 is enabled. - unsafe { _mm256_movemask_epi8(self.vector) as u32 } - } - - #[inline] - pub fn alignr_14(self, other: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { - // Credit goes to jneem for figuring this out: - // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 - // - // TL;DR avx2's PALIGNR instruction is actually just two 128-bit - // PALIGNR instructions, which is not what we want, so we need to - // do some extra shuffling. - let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21); - let v = _mm256_alignr_epi8(self.vector, v, 14); - u8x32 { vector: v } - } - } - - #[inline] - pub fn alignr_15(self, other: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { - // Credit goes to jneem for figuring this out: - // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 - // - // TL;DR avx2's PALIGNR instruction is actually just two 128-bit - // PALIGNR instructions, which is not what we want, so we need to - // do some extra shuffling. - let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21); - let v = _mm256_alignr_epi8(self.vector, v, 15); - u8x32 { vector: v } - } - } - - #[inline] - pub fn bit_shift_right_4(self) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { u8x32 { vector: _mm256_srli_epi16(self.vector, 4) } } - } - - #[inline] - pub fn bytes(self) -> [u8; 32] { - // Safe because __m256i and [u8; 32] are layout compatible - unsafe { mem::transmute(self) } - } - - #[inline] - pub fn replace_bytes(&mut self, value: [u8; 32]) { - // Safe because __m256i and [u8; 32] are layout compatible - self.vector = unsafe { mem::transmute(value) }; - } -} - -impl fmt::Debug for u8x32 { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.bytes().fmt(f) - } -} diff --git a/src/vector/mod.rs b/src/vector/mod.rs deleted file mode 100644 index 1dbae19a09..0000000000 --- a/src/vector/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -#[cfg(target_arch = "x86_64")] -pub mod avx2; -#[cfg(target_arch = "x86_64")] -pub mod ssse3; diff --git a/src/vector/ssse3.rs b/src/vector/ssse3.rs deleted file mode 100644 index 99e99ab87d..0000000000 --- a/src/vector/ssse3.rs +++ /dev/null @@ -1,186 +0,0 @@ -#![allow(dead_code)] - -use std::arch::x86_64::*; -use std::fmt; -use std::mem; - -/// A builder for SSSE3 empowered vectors. -/// -/// This builder represents a receipt that the SSSE3 target feature is enabled -/// on the currently running CPU. Namely, the only way to get a value of this -/// type is if the SSSE3 feature is enabled. -/// -/// This type can then be used to build vector types that use SSSE3 features -/// safely. -#[derive(Clone, Copy, Debug)] -pub struct SSSE3VectorBuilder(()); - -impl SSSE3VectorBuilder { - /// Create a new SSSE3 vector builder. 
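// Illustrative sketch (not part of this patch): what the AVX2 `alignr_14`
// above computes once the per-lane PALIGNR behavior is fixed up with
// `_mm256_permute2x128_si256`. With `prev` standing in for `other` and `cur`
// for `self`, the result is the last 2 bytes of `prev` followed by the first
// 30 bytes of `cur`; `alignr_15` keeps only the last byte of `prev`.
fn alignr_14_model(cur: &[u8; 32], prev: &[u8; 32]) -> [u8; 32] {
    // Equivalent to taking the 32-byte window of (prev ++ cur) that starts
    // two bytes before the current block.
    let mut out = [0u8; 32];
    out[..2].copy_from_slice(&prev[30..]);
    out[2..].copy_from_slice(&cur[..30]);
    out
}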
- ///
- /// If the SSSE3 feature is not enabled for the current target, then
- /// return `None`.
- pub fn new() -> Option<SSSE3VectorBuilder> {
- if is_x86_feature_detected!("ssse3") {
- Some(SSSE3VectorBuilder(()))
- } else {
- None
- }
- }
-
- /// Create a new u8x16 SSSE3 vector where all of the bytes are set to
- /// the given value.
- #[inline]
- pub fn u8x16_splat(self, n: u8) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe { u8x16::splat(n) }
- }
-
- /// Load 16 bytes from the given slice, with bounds checks.
- #[inline]
- pub fn u8x16_load_unaligned(self, slice: &[u8]) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe { u8x16::load_unaligned(slice) }
- }
-
- /// Load 16 bytes from the given slice, without bounds checks.
- #[inline]
- pub unsafe fn u8x16_load_unchecked_unaligned(self, slice: &[u8]) -> u8x16 {
- // Safe because we know SSSE3 is enabled, but still unsafe
- // because we aren't doing bounds checks.
- u8x16::load_unchecked_unaligned(slice)
- }
-
- /// Load 16 bytes from the given slice, with bound and alignment checks.
- #[inline]
- pub fn u8x16_load(self, slice: &[u8]) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe { u8x16::load(slice) }
- }
-
- /// Load 16 bytes from the given slice, without bound or alignment checks.
- #[inline]
- pub unsafe fn u8x16_load_unchecked(self, slice: &[u8]) -> u8x16 {
- // Safe because we know SSSE3 is enabled, but still unsafe
- // because we aren't doing bounds checks.
- u8x16::load_unchecked(slice)
- }
-}
-
-/// A u8x16 is a 128-bit vector with 16 single-byte lanes.
-///
-/// It provides a safe API that uses only SSE2 or SSSE3 instructions.
-/// The only way for callers to construct a value of this type is
-/// through the SSSE3VectorBuilder type, and the only way to get a
-/// SSSE3VectorBuilder is if the `ssse3` target feature is enabled.
-///
-/// Note that generally speaking, all uses of this type should get
-/// inlined, otherwise you probably have a performance bug.
-#[derive(Clone, Copy)]
-#[allow(non_camel_case_types)]
-#[repr(transparent)]
-pub struct u8x16 {
- vector: __m128i,
-}
-
-impl u8x16 {
- #[inline]
- unsafe fn splat(n: u8) -> u8x16 {
- u8x16 { vector: _mm_set1_epi8(n as i8) }
- }
-
- #[inline]
- unsafe fn load_unaligned(slice: &[u8]) -> u8x16 {
- assert!(slice.len() >= 16);
- u8x16::load_unchecked(slice)
- }
-
- #[inline]
- unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x16 {
- let v = _mm_loadu_si128(slice.as_ptr() as *const u8 as *const __m128i);
- u8x16 { vector: v }
- }
-
- #[inline]
- unsafe fn load(slice: &[u8]) -> u8x16 {
- assert!(slice.len() >= 16);
- assert!(slice.as_ptr() as usize % 16 == 0);
- u8x16::load_unchecked(slice)
- }
-
- #[inline]
- unsafe fn load_unchecked(slice: &[u8]) -> u8x16 {
- let v = _mm_load_si128(slice.as_ptr() as *const u8 as *const __m128i);
- u8x16 { vector: v }
- }
-
- #[inline]
- pub fn shuffle(self, indices: u8x16) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe {
- u8x16 { vector: _mm_shuffle_epi8(self.vector, indices.vector) }
- }
- }
-
- #[inline]
- pub fn ne(self, other: u8x16) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe {
- let boolv = _mm_cmpeq_epi8(self.vector, other.vector);
- let ones = _mm_set1_epi8(0xFF as u8 as i8);
- u8x16 { vector: _mm_andnot_si128(boolv, ones) }
- }
- }
-
- #[inline]
- pub fn and(self, other: u8x16) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
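// Illustrative sketch (not part of this patch): the byte-level semantics of
// `ne` above. `_mm_cmpeq_epi8` produces 0xFF for equal lanes and 0x00
// otherwise, and the ANDNOT against an all-ones vector flips that into a
// "not equal" mask.
fn ne_model(a: &[u8; 16], b: &[u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = if a[i] == b[i] { 0x00 } else { 0xFF };
    }
    out
}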
- unsafe { u8x16 { vector: _mm_and_si128(self.vector, other.vector) } } - } - - #[inline] - pub fn movemask(self) -> u32 { - // Safe because we know SSSE3 is enabled. - unsafe { _mm_movemask_epi8(self.vector) as u32 } - } - - #[inline] - pub fn alignr_14(self, other: u8x16) -> u8x16 { - // Safe because we know SSSE3 is enabled. - unsafe { - u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 14) } - } - } - - #[inline] - pub fn alignr_15(self, other: u8x16) -> u8x16 { - // Safe because we know SSSE3 is enabled. - unsafe { - u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 15) } - } - } - - #[inline] - pub fn bit_shift_right_4(self) -> u8x16 { - // Safe because we know SSSE3 is enabled. - unsafe { u8x16 { vector: _mm_srli_epi16(self.vector, 4) } } - } - - #[inline] - pub fn bytes(self) -> [u8; 16] { - // Safe because __m128i and [u8; 16] are layout compatible - unsafe { mem::transmute(self) } - } - - #[inline] - pub fn replace_bytes(&mut self, value: [u8; 16]) { - // Safe because __m128i and [u8; 16] are layout compatible - self.vector = unsafe { mem::transmute(value) }; - } -} - -impl fmt::Debug for u8x16 { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.bytes().fmt(f) - } -}
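For reference, the step these vector types feed into is candidate extraction via
`movemask`: it packs the most significant bit of every byte lane into an integer,
so a lane holding 0xFF (e.g. from a comparison) contributes a set bit at its
offset. A minimal sketch of walking those candidate offsets, using illustrative
names and nothing from the deleted modules, looks like this:

fn candidate_offsets(mut mask: u32) -> Vec<usize> {
    // Each set bit marks a byte lane whose comparison produced 0xFF.
    let mut offsets = Vec::new();
    while mask != 0 {
        // Index of the lowest set bit, i.e. the leftmost remaining candidate.
        offsets.push(mask.trailing_zeros() as usize);
        // Clear that bit and keep scanning.
        mask &= mask - 1;
    }
    offsets
}

Offsets recovered this way are only candidates: nybble fingerprints can collide,
so each one still has to be verified against the actual literals before a match
is reported.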