From 6b869838f107d4c92bd80c0cab3543131f406a73 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 3 Aug 2019 09:58:49 -0400 Subject: [PATCH] literal: remove teddy The Teddy algorithm has been moved to the aho-corasick crate (as of 0.7.5), so we can now use it there. Note that we do explicitly use aho-corasick's `packed` module instead of relying on `AhoCorasick`'s prefilter to do it for us. The reasoning, unfortunately, is that using Teddy inside of `AhoCorasick` has some measurable overhead that we'd like to avoid. It would be better to figure out how to remove that overhead, but I was unsuccessful. It isn't much additional work to reach around and use the packed search directly. Benchmarks roughly stay the same, with the `regexdna::variant` benchmarks improving across the board by about 1.5x-2x. Some benchmarks do regress (e.g., `sherlock::the_nocase`), but we decide to live with it for now. The regression is likely due to subtle changes in Teddy's bucket allocation. --- Cargo.toml | 2 +- src/lib.rs | 1 - src/{literal/mod.rs => literal.rs} | 92 +--- src/literal/teddy_avx2/fallback.rs | 32 -- src/literal/teddy_avx2/imp.rs | 478 ----------------- src/literal/teddy_avx2/mod.rs | 8 - src/literal/teddy_ssse3/fallback.rs | 32 -- src/literal/teddy_ssse3/imp.rs | 785 ---------------------------- src/literal/teddy_ssse3/mod.rs | 8 - src/vector/avx2.rs | 183 ------- src/vector/mod.rs | 4 - src/vector/ssse3.rs | 186 ------- 12 files changed, 29 insertions(+), 1782 deletions(-) rename src/{literal/mod.rs => literal.rs} (92%) delete mode 100644 src/literal/teddy_avx2/fallback.rs delete mode 100644 src/literal/teddy_avx2/imp.rs delete mode 100644 src/literal/teddy_avx2/mod.rs delete mode 100644 src/literal/teddy_ssse3/fallback.rs delete mode 100644 src/literal/teddy_ssse3/imp.rs delete mode 100644 src/literal/teddy_ssse3/mod.rs delete mode 100644 src/vector/avx2.rs delete mode 100644 src/vector/mod.rs delete mode 100644 src/vector/ssse3.rs diff --git a/Cargo.toml b/Cargo.toml index a58f257ee0..c92e36f4be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ members = [ [dependencies] # For very fast prefix literal matching. -aho-corasick = "0.7.4" +aho-corasick = "0.7.6" # For skipping along search text quickly when a leading byte is known. memchr = "2.2.1" # For managing regex caches quickly across multiple threads. diff --git a/src/lib.rs b/src/lib.rs index daaa9885da..b46179f095 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -659,7 +659,6 @@ mod re_trait; mod re_unicode; mod sparse; mod utf8; -mod vector; /// The `internal` module exists to support suspicious activity, such as /// testing different matching engines and supporting the `regex-debug` CLI diff --git a/src/literal/mod.rs b/src/literal.rs similarity index 92% rename from src/literal/mod.rs rename to src/literal.rs index 64123b6441..ae405cbf4a 100644 --- a/src/literal/mod.rs +++ b/src/literal.rs @@ -1,17 +1,12 @@ use std::cmp; use std::mem; -use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder}; +use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; use memchr::{memchr, memchr2, memchr3}; use syntax::hir::literal::{Literal, Literals}; -use self::teddy_avx2::Teddy as TeddyAVX2; -use self::teddy_ssse3::Teddy as TeddySSSE3; use freqs::BYTE_FREQUENCIES; -mod teddy_avx2; -mod teddy_ssse3; - /// A prefix extracted from a compiled regular expression.
/// /// A regex prefix is a set of literal strings that *must* be matched at the @@ -37,12 +32,13 @@ enum Matcher { BoyerMoore(BoyerMooreSearch), /// An Aho-Corasick automaton. AC { ac: AhoCorasick, lits: Vec }, - /// A simd accelerated multiple string matcher. Used only for a small - /// number of small literals. - TeddySSSE3(TeddySSSE3), - /// A simd accelerated multiple string matcher. Used only for a small - /// number of small literals. This uses 256-bit vectors. - TeddyAVX2(TeddyAVX2), + /// A packed multiple substring searcher, using SIMD. + /// + /// Note that Aho-Corasick will actually use this packed searcher + /// internally automatically, however, there is some overhead associated + /// with going through the Aho-Corasick machinery. So using the packed + /// searcher directly results in some gains. + Packed { s: packed::Searcher, lits: Vec }, } impl LiteralSearcher { @@ -95,8 +91,9 @@ impl LiteralSearcher { AC { ref ac, .. } => { ac.find(haystack).map(|m| (m.start(), m.end())) } - TeddySSSE3(ref t) => t.find(haystack).map(|m| (m.start, m.end)), - TeddyAVX2(ref t) => t.find(haystack).map(|m| (m.start, m.end)), + Packed { ref s, .. } => { + s.find(haystack).map(|m| (m.start(), m.end())) + } } } @@ -134,12 +131,7 @@ impl LiteralSearcher { Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat), Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern), Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), - Matcher::TeddySSSE3(ref ted) => { - LiteralIter::TeddySSSE3(ted.patterns()) - } - Matcher::TeddyAVX2(ref ted) => { - LiteralIter::TeddyAVX2(ted.patterns()) - } + Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), } } @@ -167,8 +159,7 @@ impl LiteralSearcher { FreqyPacked(_) => 1, BoyerMoore(_) => 1, AC { ref ac, .. } => ac.pattern_count(), - TeddySSSE3(ref ted) => ted.len(), - TeddyAVX2(ref ted) => ted.len(), + Packed { ref lits, .. } => lits.len(), } } @@ -181,8 +172,7 @@ impl LiteralSearcher { FreqyPacked(ref single) => single.approximate_size(), BoyerMoore(ref single) => single.approximate_size(), AC { ref ac, .. } => ac.heap_bytes(), - TeddySSSE3(ref ted) => ted.approximate_size(), - TeddyAVX2(ref ted) => ted.approximate_size(), + Packed { ref s, .. } => s.heap_bytes(), } } } @@ -222,34 +212,17 @@ impl Matcher { return Matcher::FreqyPacked(FreqyPacked::new(lit)); } } - let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii; - if TeddyAVX2::available() && !is_aho_corasick_fast { - const MAX_TEDDY_LITERALS: usize = 32; - if lits.literals().len() <= MAX_TEDDY_LITERALS { - if let Some(ted) = TeddyAVX2::new(lits) { - return Matcher::TeddyAVX2(ted); - } - } - } - if TeddySSSE3::available() && !is_aho_corasick_fast { - // Only try Teddy if Aho-Corasick can't use memchr on an ASCII - // byte. Also, in its current form, Teddy doesn't scale well to - // lots of literals. - // - // We impose the ASCII restriction since an alternation of - // non-ASCII string literals in the same language is likely to all - // start with the same byte. Even worse, the corpus being searched - // probably has a similar composition, which ends up completely - // negating the benefit of memchr. 
- const MAX_TEDDY_LITERALS: usize = 32; - if lits.literals().len() <= MAX_TEDDY_LITERALS { - if let Some(ted) = TeddySSSE3::new(lits) { - return Matcher::TeddySSSE3(ted); - } + + let pats = lits.literals().to_owned(); + let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; + if lits.literals().len() <= 100 && !is_aho_corasick_fast { + let mut builder = packed::Config::new() + .match_kind(packed::MatchKind::LeftmostFirst) + .builder(); + if let Some(s) = builder.extend(&pats).build() { + return Matcher::Packed { s, lits: pats }; } - // Fallthrough to ol' reliable Aho-Corasick... } - let pats = lits.literals().to_owned(); let ac = AhoCorasickBuilder::new() .match_kind(aho_corasick::MatchKind::LeftmostFirst) .dfa(true) @@ -264,8 +237,7 @@ pub enum LiteralIter<'a> { Bytes(&'a [u8]), Single(&'a [u8]), AC(&'a [Literal]), - TeddySSSE3(&'a [Vec]), - TeddyAVX2(&'a [Vec]), + Packed(&'a [Literal]), } impl<'a> Iterator for LiteralIter<'a> { @@ -301,16 +273,7 @@ impl<'a> Iterator for LiteralIter<'a> { Some(&**next) } } - LiteralIter::TeddySSSE3(ref mut lits) => { - if lits.is_empty() { - None - } else { - let next = &lits[0]; - *lits = &lits[1..]; - Some(&**next) - } - } - LiteralIter::TeddyAVX2(ref mut lits) => { + LiteralIter::Packed(ref mut lits) => { if lits.is_empty() { None } else { @@ -809,8 +772,9 @@ impl BoyerMooreSearch { if window_end - window_end_snapshot > 16 * mem::size_of::() { - // Returning a window_end >= backstop will immediatly - // break us out of the inner loop in `find`. + // Returning a window_end >= backstop will + // immediatly break us out of the inner loop in + // `find`. if window_end >= backstop { return Some(window_end); } diff --git a/src/literal/teddy_avx2/fallback.rs b/src/literal/teddy_avx2/fallback.rs deleted file mode 100644 index 953895abc5..0000000000 --- a/src/literal/teddy_avx2/fallback.rs +++ /dev/null @@ -1,32 +0,0 @@ -use syntax::hir::literal::Literals; - -#[derive(Debug, Clone)] -pub struct Teddy(()); - -#[derive(Debug, Clone)] -pub struct Match { - pub pat: usize, - pub start: usize, - pub end: usize, -} - -impl Teddy { - pub fn available() -> bool { - false - } - pub fn new(_pats: &Literals) -> Option { - None - } - pub fn patterns(&self) -> &[Vec] { - &[] - } - pub fn len(&self) -> usize { - 0 - } - pub fn approximate_size(&self) -> usize { - 0 - } - pub fn find(&self, _haystack: &[u8]) -> Option { - None - } -} diff --git a/src/literal/teddy_avx2/imp.rs b/src/literal/teddy_avx2/imp.rs deleted file mode 100644 index 1ce3adfa52..0000000000 --- a/src/literal/teddy_avx2/imp.rs +++ /dev/null @@ -1,478 +0,0 @@ -/*! -This is the Teddy searcher, but ported to AVX2. - -See the module comments in the SSSE3 Teddy searcher for a more in depth -explanation of how this algorithm works. For the most part, this port is -basically the same as the SSSE3 version, but using 256-bit vectors instead of -128-bit vectors, which increases throughput. -*/ - -use std::cmp; - -use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder}; -use syntax::hir::literal::Literals; - -use vector::avx2::{u8x32, AVX2VectorBuilder}; - -/// Corresponds to the number of bytes read at a time in the haystack. -const BLOCK_SIZE: usize = 32; - -/// Match reports match information. -#[derive(Debug, Clone)] -pub struct Match { - /// The index of the pattern that matched. The index is in correspondence - /// with the order of the patterns given at construction. - pub pat: usize, - /// The start byte offset of the match. - pub start: usize, - /// The end byte offset of the match. 
This is always `start + pat.len()`. - pub end: usize, -} - -/// A SIMD accelerated multi substring searcher. -#[derive(Debug, Clone)] -pub struct Teddy { - /// A builder for AVX2 empowered vectors. - vb: AVX2VectorBuilder, - /// A list of substrings to match. - pats: Vec>, - /// An Aho-Corasick automaton of the patterns. We use this when we need to - /// search pieces smaller than the Teddy block size. - ac: AhoCorasick, - /// A set of 8 buckets. Each bucket corresponds to a single member of a - /// bitset. A bucket contains zero or more substrings. This is useful - /// when the number of substrings exceeds 8, since our bitsets cannot have - /// more than 8 members. - buckets: Vec>, - /// Our set of masks. There's one mask for each byte in the fingerprint. - masks: Masks, -} - -impl Teddy { - /// Returns true if and only if Teddy is supported on this platform. - /// - /// If this returns `false`, then `Teddy::new(...)` is guaranteed to - /// return `None`. - pub fn available() -> bool { - AVX2VectorBuilder::new().is_some() - } - - /// Create a new `Teddy` multi substring matcher. - /// - /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has - /// an empty substring), then `None` is returned. - pub fn new(pats: &Literals) -> Option { - let vb = match AVX2VectorBuilder::new() { - None => return None, - Some(vb) => vb, - }; - if !Teddy::available() { - return None; - } - - let pats: Vec<_> = - pats.literals().iter().map(|p| p.to_vec()).collect(); - let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0); - // Don't allow any empty patterns and require that we have at - // least one pattern. - if min_len < 1 { - return None; - } - // Pick the largest mask possible, but no larger than 3. - let nmasks = cmp::min(3, min_len); - let mut masks = Masks::new(vb, nmasks); - let mut buckets = vec![vec![]; 8]; - // Assign a substring to each bucket, and add the bucket's bitfield to - // the appropriate position in the mask. - for (pati, pat) in pats.iter().enumerate() { - let bucket = pati % 8; - buckets[bucket].push(pati); - masks.add(bucket as u8, pat); - } - let ac = AhoCorasickBuilder::new() - .match_kind(aho_corasick::MatchKind::LeftmostFirst) - .dfa(true) - .prefilter(false) - .build(&pats); - Some(Teddy { - vb: vb, - pats: pats.to_vec(), - ac: ac, - buckets: buckets, - masks: masks, - }) - } - - /// Returns all of the substrings matched by this `Teddy`. - pub fn patterns(&self) -> &[Vec] { - &self.pats - } - - /// Returns the number of substrings in this matcher. - pub fn len(&self) -> usize { - self.pats.len() - } - - /// Returns the approximate size on the heap used by this matcher. - pub fn approximate_size(&self) -> usize { - self.pats.iter().fold(0, |a, b| a + b.len()) - } - - /// Searches `haystack` for the substrings in this `Teddy`. If a match was - /// found, then it is returned. Otherwise, `None` is returned. - pub fn find(&self, haystack: &[u8]) -> Option { - // This is safe because the only way we can construct a Teddy type - // is if AVX2 is available. - unsafe { self.find_impl(haystack) } - } - - #[allow(unused_attributes)] - #[target_feature(enable = "avx2")] - unsafe fn find_impl(&self, haystack: &[u8]) -> Option { - // If our haystack is smaller than the block size, then fall back to - // a naive brute force search. 
- if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) { - return self.slow(haystack, 0); - } - match self.masks.len() { - 0 => None, - 1 => self.find1(haystack), - 2 => self.find2(haystack), - 3 => self.find3(haystack), - _ => unreachable!(), - } - } - - /// `find1` is used when there is only 1 mask. This is the easy case and is - /// pretty much as described in the module documentation. - #[inline(always)] - fn find1(&self, haystack: &[u8]) -> Option { - let mut pos = 0; - let zero = self.vb.u8x32_splat(0); - let len = haystack.len(); - debug_assert!(len >= BLOCK_SIZE); - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-32. - let p = haystack.get_unchecked(pos..); - self.vb.u8x32_load_unchecked_unaligned(p) - }; - // N.B. `res0` is our `C` in the module documentation. - let res0 = self.masks.members1(h); - // Only do expensive verification if there are any non-zero bits. - let bitfield = res0.ne(zero).movemask(); - if bitfield != 0 { - if let Some(m) = self.verify(haystack, pos, res0, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - self.slow(haystack, pos) - } - - /// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes - /// long. - #[inline(always)] - fn find2(&self, haystack: &[u8]) -> Option { - // This is an exotic way to right shift a SIMD vector across lanes. - // See below at use for more details. - let zero = self.vb.u8x32_splat(0); - let len = haystack.len(); - // The previous value of `C` (from the module documentation) for the - // *first* byte in the fingerprint. On subsequent iterations, we take - // the last bitset from the previous `C` and insert it into the first - // position of the current `C`, shifting all other bitsets to the right - // one lane. This causes `C` for the first byte to line up with `C` for - // the second byte, so that they can be `AND`'d together. - let mut prev0 = self.vb.u8x32_splat(0xFF); - let mut pos = 1; - debug_assert!(len >= BLOCK_SIZE); - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-32. - let p = haystack.get_unchecked(pos..); - self.vb.u8x32_load_unchecked_unaligned(p) - }; - let (res0, res1) = self.masks.members2(h); - - // Do this: - // - // (prev0 << 15) | (res0 >> 1) - // - // This lets us line up our C values for each byte. - let res0prev0 = res0.alignr_15(prev0); - - // `AND`'s our `C` values together. - let res = res0prev0.and(res1); - prev0 = res0; - - let bitfield = res.ne(zero).movemask(); - if bitfield != 0 { - let pos = pos.checked_sub(1).unwrap(); - if let Some(m) = self.verify(haystack, pos, res, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - // The windowing above doesn't check the last byte in the last - // window, so start the slow search at the last byte of the last - // window. - self.slow(haystack, pos.checked_sub(1).unwrap()) - } - - /// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes - /// long. - /// - /// N.B. This is a straight-forward extrapolation of `find2`. The only - /// difference is that we need to keep track of two previous values of `C`, - /// since we now need to align for three bytes. 
- #[inline(always)] - fn find3(&self, haystack: &[u8]) -> Option { - let zero = self.vb.u8x32_splat(0); - let len = haystack.len(); - let mut prev0 = self.vb.u8x32_splat(0xFF); - let mut prev1 = self.vb.u8x32_splat(0xFF); - let mut pos = 2; - - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-32. - let p = haystack.get_unchecked(pos..); - self.vb.u8x32_load_unchecked_unaligned(p) - }; - let (res0, res1, res2) = self.masks.members3(h); - - let res0prev0 = res0.alignr_14(prev0); - let res1prev1 = res1.alignr_15(prev1); - let res = res0prev0.and(res1prev1).and(res2); - - prev0 = res0; - prev1 = res1; - - let bitfield = res.ne(zero).movemask(); - if bitfield != 0 { - let pos = pos.checked_sub(2).unwrap(); - if let Some(m) = self.verify(haystack, pos, res, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - // The windowing above doesn't check the last two bytes in the last - // window, so start the slow search at the penultimate byte of the - // last window. - // self.slow(haystack, pos.saturating_sub(2)) - self.slow(haystack, pos.checked_sub(2).unwrap()) - } - - /// Runs the verification procedure on `res` (i.e., `C` from the module - /// documentation), where the haystack block starts at `pos` in - /// `haystack`. `bitfield` has ones in the bit positions that `res` has - /// non-zero bytes. - /// - /// If a match exists, it returns the first one. - #[inline(always)] - fn verify( - &self, - haystack: &[u8], - pos: usize, - res: u8x32, - mut bitfield: u32, - ) -> Option { - let patterns = res.bytes(); - while bitfield != 0 { - // The next offset, relative to pos, where some fingerprint - // matched. - let byte_pos = bitfield.trailing_zeros() as usize; - bitfield &= !(1 << byte_pos); - - // Offset relative to the beginning of the haystack. - let start = pos + byte_pos; - - // The bitfield telling us which patterns had fingerprints that - // match at this starting position. - let mut patterns = patterns[byte_pos]; - while patterns != 0 { - let bucket = patterns.trailing_zeros() as usize; - patterns &= !(1 << bucket); - - // Actual substring search verification. - if let Some(m) = self.verify_bucket(haystack, bucket, start) { - return Some(m); - } - } - } - - None - } - - /// Verifies whether any substring in the given bucket matches in haystack - /// at the given starting position. - #[inline(always)] - fn verify_bucket( - &self, - haystack: &[u8], - bucket: usize, - start: usize, - ) -> Option { - // This cycles through the patterns in the bucket in the order that - // the patterns were given. Therefore, we guarantee leftmost-first - // semantics. - for &pati in &self.buckets[bucket] { - let pat = &*self.pats[pati]; - if start + pat.len() > haystack.len() { - continue; - } - if pat == &haystack[start..start + pat.len()] { - return Some(Match { - pat: pati, - start: start, - end: start + pat.len(), - }); - } - } - None - } - - /// Slow substring search through all patterns in this matcher. - /// - /// This is used when we don't have enough bytes in the haystack for our - /// block based approach. - #[inline(never)] - fn slow(&self, haystack: &[u8], pos: usize) -> Option { - self.ac.find(&haystack[pos..]).map(|m| Match { - pat: m.pattern(), - start: pos + m.start(), - end: pos + m.end(), - }) - } -} - -/// A list of masks. This has length equal to the length of the fingerprint. 
-/// The length of the fingerprint is always `min(3, len(smallest_substring))`. -#[derive(Debug, Clone)] -struct Masks { - vb: AVX2VectorBuilder, - masks: [Mask; 3], - size: usize, -} - -impl Masks { - /// Create a new set of masks of size `n`, where `n` corresponds to the - /// number of bytes in a fingerprint. - fn new(vb: AVX2VectorBuilder, n: usize) -> Masks { - Masks { - vb: vb, - masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)], - size: n, - } - } - - /// Returns the number of masks. - fn len(&self) -> usize { - self.size - } - - /// Adds the given pattern to the given bucket. The bucket should be a - /// power of `2 <= 2^7`. - fn add(&mut self, bucket: u8, pat: &[u8]) { - for i in 0..self.len() { - self.masks[i].add(bucket, pat[i]); - } - } - - /// Finds the fingerprints that are in the given haystack block. i.e., this - /// returns `C` as described in the module documentation. - /// - /// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and - /// only if `haystack_block[i]` corresponds to a fingerprint that is part - /// of a pattern in bucket `j`. - #[inline(always)] - fn members1(&self, haystack_block: u8x32) -> u8x32 { - let masklo = self.vb.u8x32_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)) - } - - /// Like members1, but computes C for the first and second bytes in the - /// fingerprint. - #[inline(always)] - fn members2(&self, haystack_block: u8x32) -> (u8x32, u8x32) { - let masklo = self.vb.u8x32_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - let res0 = - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); - let res1 = - self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); - (res0, res1) - } - - /// Like `members1`, but computes `C` for the first, second and third bytes - /// in the fingerprint. - #[inline(always)] - fn members3(&self, haystack_block: u8x32) -> (u8x32, u8x32, u8x32) { - let masklo = self.vb.u8x32_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - let res0 = - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); - let res1 = - self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); - let res2 = - self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi)); - (res0, res1, res2) - } -} - -/// A single mask. -#[derive(Debug, Clone, Copy)] -struct Mask { - /// Bitsets for the low nybbles in a fingerprint. - lo: u8x32, - /// Bitsets for the high nybbles in a fingerprint. - hi: u8x32, -} - -impl Mask { - /// Create a new mask with no members. - fn new(vb: AVX2VectorBuilder) -> Mask { - Mask { lo: vb.u8x32_splat(0), hi: vb.u8x32_splat(0) } - } - - /// Adds the given byte to the given bucket. - fn add(&mut self, bucket: u8, byte: u8) { - // Split our byte into two nybbles, and add each nybble to our - // mask. 
- let byte_lo = (byte & 0xF) as usize; - let byte_hi = (byte >> 4) as usize; - - { - let mut lo_bytes = self.lo.bytes(); - let lo = lo_bytes[byte_lo] | ((1 << bucket) as u8); - lo_bytes[byte_lo] = lo; - lo_bytes[byte_lo + 16] = lo; - self.lo.replace_bytes(lo_bytes); - } - - { - let mut hi_bytes = self.hi.bytes(); - let hi = hi_bytes[byte_hi] | ((1 << bucket) as u8); - hi_bytes[byte_hi] = hi; - hi_bytes[byte_hi + 16] = hi; - self.hi.replace_bytes(hi_bytes); - } - } -} diff --git a/src/literal/teddy_avx2/mod.rs b/src/literal/teddy_avx2/mod.rs deleted file mode 100644 index 6930ed5028..0000000000 --- a/src/literal/teddy_avx2/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub use self::imp::*; - -#[cfg(target_arch = "x86_64")] -mod imp; - -#[cfg(not(target_arch = "x86_64"))] -#[path = "fallback.rs"] -mod imp; diff --git a/src/literal/teddy_ssse3/fallback.rs b/src/literal/teddy_ssse3/fallback.rs deleted file mode 100644 index 953895abc5..0000000000 --- a/src/literal/teddy_ssse3/fallback.rs +++ /dev/null @@ -1,32 +0,0 @@ -use syntax::hir::literal::Literals; - -#[derive(Debug, Clone)] -pub struct Teddy(()); - -#[derive(Debug, Clone)] -pub struct Match { - pub pat: usize, - pub start: usize, - pub end: usize, -} - -impl Teddy { - pub fn available() -> bool { - false - } - pub fn new(_pats: &Literals) -> Option { - None - } - pub fn patterns(&self) -> &[Vec] { - &[] - } - pub fn len(&self) -> usize { - 0 - } - pub fn approximate_size(&self) -> usize { - 0 - } - pub fn find(&self, _haystack: &[u8]) -> Option { - None - } -} diff --git a/src/literal/teddy_ssse3/imp.rs b/src/literal/teddy_ssse3/imp.rs deleted file mode 100644 index ac7ed11e9d..0000000000 --- a/src/literal/teddy_ssse3/imp.rs +++ /dev/null @@ -1,785 +0,0 @@ -/*! -Teddy is a simd accelerated multiple substring matching algorithm. The name -and the core ideas in the algorithm were learned from the [Hyperscan][1_u] -project. - - -Background ----------- - -The key idea of Teddy is to do *packed* substring matching. In the literature, -packed substring matching is the idea of examining multiple bytes in a haystack -at a time to detect matches. Implementations of, for example, memchr (which -detects matches of a single byte) have been doing this for years. Only -recently, with the introduction of various SIMD instructions, has this been -extended to substring matching. The PCMPESTRI instruction (and its relatives), -for example, implements substring matching in hardware. It is, however, limited -to substrings of length 16 bytes or fewer, but this restriction is fine in a -regex engine, since we rarely care about the performance difference between -searching for a 16 byte literal and a 16 + N literal; 16 is already long -enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs -at least, is its latency and throughput. As a result, it is often faster to do -substring search with a Boyer-Moore variant and a well placed memchr to quickly -skip through the haystack. - -There are fewer results from the literature on packed substring matching, -and even fewer for packed multiple substring matching. Ben-Kiki et al. [2] -describes use of PCMPESTRI for substring matching, but is mostly theoretical -and hand-waves performance. There is other theoretical work done by Bille [3] -as well. - -The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci -and is generally focused on multiple pattern search. 
Their first paper [4a] -introduces the concept of a fingerprint, which is computed for every block of -N bytes in every pattern. The haystack is then scanned N bytes at a time and -a fingerprint is computed in the same way it was computed for blocks in the -patterns. If the fingerprint corresponds to one that was found in a pattern, -then a verification step follows to confirm that one of the substrings with the -corresponding fingerprint actually matches at the current location. Various -implementation tricks are employed to make sure the fingerprint lookup is fast; -typically by truncating the fingerprint. (This may, of course, provoke more -steps in the verification process, so a balance must be struck.) - -The main downside of [4a] is that the minimum substring length is 32 bytes, -presumably because of how the algorithm uses certain SIMD instructions. This -essentially makes it useless for general purpose regex matching, where a small -number of short patterns is far more likely. - -Faro and Kulekci published another paper [4b] that is conceptually very similar -to [4a]. The key difference is that it uses the CRC32 instruction (introduced -as part of SSE 4.2) to compute fingerprint values. This also enables the -algorithm to work effectively on substrings as short as 7 bytes with 4 byte -windows. 7 bytes is unfortunately still too long. The window could be -technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the -small window size ends up negating most performance benefits—and it's likely -the common case in a general purpose regex engine. - -Faro and Kulekci also published [4c] that appears to be intended as a -replacement to using PCMPESTRI. In particular, it is specifically motivated by -the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD -instructions that are faster. While this approach works for short substrings, -I personally couldn't see a way to generalize it to multiple substring search. - -Faro and Kulekci have another paper [4d] that I haven't been able to read -because it is behind a paywall. - - -Teddy ------ - -Finally, we get to Teddy. If the above literature review is complete, then it -appears that Teddy is a novel algorithm. More than that, in my experience, it -completely blows away the competition for short substrings, which is exactly -what we want in a general purpose regex engine. Again, the algorithm appears -to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced -late 2015, and no earlier history could be found. Therefore, tracking the exact -provenance of the algorithm with respect to the published literature seems -difficult. - -DISCLAIMER: My understanding of Teddy is limited to reading auto-generated C -code, its disassembly and observing its runtime behavior. - -At a high level, Teddy works somewhat similarly to the fingerprint algorithms -published by Faro and Kulekci, but Teddy does it in a way that scales a bit -better. Namely: - -1. Teddy's core algorithm scans the haystack in 16 byte chunks. 16 is - significant because it corresponds to the number of bytes in a SIMD vector. - If one used AVX2 instructions, then we could scan the haystack in 32 byte - chunks. Similarly, if one used AVX512 instructions, we could scan the - haystack in 64 byte chunks. Hyperscan implements SSE + AVX2, we only - implement SSE for the moment. -2. Bitwise operations are performed on each chunk to discover if any region of - it matches a set of precomputed fingerprints from the patterns. 
If there are - matches, then a verification step is performed. In this implementation, our - verification step is naive. This can be improved upon. - -The details to make this work are quite clever. First, we must choose how to -pick our fingerprints. In Hyperscan's implementation, I *believe* they use the -last N bytes of each substring, where N must be at least the minimum length of -any substring in the set being searched. In this implementation, we use the -first N bytes of each substring. (The tradeoffs between these choices aren't -yet clear to me.) We then must figure out how to quickly test whether an -occurrence of any fingerprint from the set of patterns appears in a 16 byte -block from the haystack. To keep things simple, let's assume N = 1 and examine -some examples to motivate the approach. Here are our patterns: - -```ignore -foo -bar -baz -``` - -The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set -our 16 byte block to: - -```ignore -bat cat foo bump -xxxxxxxxxxxxxxxx -``` - -To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates -a mask that allows us to quickly compute membership of a fingerprint in a 16 -byte block that also tells which pattern the fingerprint corresponds to. In -this case, our fingerprint is a single byte, so an appropriate abstraction is -a map from a single byte to a list of patterns that contain that fingerprint: - -```ignore -f |--> foo -b |--> bar, baz -``` - -Now, all we need to do is figure out how to represent this map in vector space -and use normal SIMD operations to perform a lookup. The first simplification -we can make is to represent our patterns as bit fields occupying a single -byte. This is important, because a single SIMD vector can store 16 bytes. - -```ignore -f |--> 00000001 -b |--> 00000010, 00000100 -``` - -How do we perform lookup though? It turns out that SSSE3 introduced a very cool -instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, -and returns a third vector `C`. All vectors are treated as 16 8-bit integers. -`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true -for the purposes of this algorithm. For full details, see [Intel's Intrinsics -Guide][5_u].) This essentially lets us use the values in `B` to lookup values -in `A`. - -If we could somehow cause `B` to contain our 16 byte block from the haystack, -and if `A` could contain our bitmasks, then we'd end up with something like -this for `A`: - -```ignore - 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF -A = 0 0 00000110 00000001 0 -``` - -And if `B` contains our window from our haystack, we could use shuffle to take -the values from `B` and use them to look up our bitsets in `A`. But of course, -we can't do this because `A` in the above example contains 256 bytes, which -is much larger than the size of a SIMD vector. - -Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of -our bitsets, we can use two masks, where one mask corresponds to the lower four -bits of our fingerprint and the other mask corresponds to the upper four bits. -So our map now looks like: - -```ignore -'f' & 0xF = 0x6 |--> 00000001 -'f' >> 4 = 0x6 |--> 00000111 -'b' & 0xF = 0x2 |--> 00000110 -'b' >> 4 = 0x6 |--> 00000111 -``` - -Notice that the bitsets for each nybble correspond to the union of all -fingerprints that contain that nybble. For example, both `f` and `b` have the -same upper 4 bits but differ on the lower 4 bits. 
Putting this together, we -have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is -our mask for the upper nybble and `B` is our 16 byte block from the haystack: - -```ignore - 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF -A0 = 0 0 00000110 0 00000001 0 -A1 = 0 0 0 0 00000111 0 -B = b a t _ t p -B = 0x62 0x61 0x74 0x20 0x74 0x70 -``` - -But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits, -and we need indexes that are at most 4 bits (corresponding to one of 16 -values). We can apply the same transformation to split `B` into lower and upper -nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and -`B1` corresponds to the upper nybbles: - -```ignore - b a t _ c a t _ f o o _ b u m p -B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0 -B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7 -``` - -And now we have a nice correspondence. `B0` can index `A0` and `B1` can index -`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`: - -```ignore - b a ... f o ... p - A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0] -C0 = 00000110 0 00000001 0 0 -``` - -And `C1 = PSHUFB(A1, B1)`: - -```ignore - b a ... f o ... p - A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7] -C1 = 00000111 00000111 00000111 00000111 0 -``` - -Notice how neither one of `C0` or `C1` is guaranteed to report fully correct -results all on its own. For example, `C1` claims that `b` is a fingerprint for -the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint -for all of our patterns. But if we combined `C0` and `C1` with an `AND` -operation: - -```ignore - b a ... f o ... p -C = 00000110 0 00000001 0 0 -``` - -Then we now have that `C[i]` contains a bitset corresponding to the matching -fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that -block. - -Once we have that, we can look for the position of the least significant bit -in `C`. That position, modulo `8`, gives us the pattern that the fingerprint -matches. That position, integer divided by `8`, also gives us the byte offset -that the fingerprint occurs in inside the 16 byte haystack block. Using those -two pieces of information, we can run a verification procedure that tries -to match all substrings containing that fingerprint at that position in the -haystack. - - -Implementation notes --------------------- - -The problem with the algorithm as described above is that it uses a single byte -for a fingerprint. This will work well if the fingerprints are rare in the -haystack (e.g., capital letters or special characters in normal English text), -but if the fingerprints are common, you'll wind up spending too much time in -the verification step, which effectively negate the performance benefits of -scanning 16 bytes at a time. Remember, the key to the performance of this -algorithm is to do as little work as possible per 16 bytes. - -This algorithm can be extrapolated in a relatively straight-forward way to use -larger fingerprints. That is, instead of a single byte prefix, we might use a -three byte prefix. The implementation below implements N = {1, 2, 3} and always -picks the largest N possible. The rationale is that the bigger the fingerprint, -the fewer verification steps we'll do. Of course, if N is too large, then we'll -end up doing too much on each step. - -The way to extend it is: - -1. Add a mask for each byte in the fingerprint. (Remember that each mask is - composed of two SIMD vectors.) 
This results in a value of `C` for each byte - in the fingerprint while searching. -2. When testing each 16 byte block, each value of `C` must be shifted so that - they are aligned. Once aligned, they should all be `AND`'d together. This - will give you only the bitsets corresponding to the full match of the - fingerprint. - -The implementation below is commented to fill in the nitty gritty details. - -References ----------- - -- **[1]** [Hyperscan on GitHub](https://github.com/01org/hyperscan), - [webpage](https://01.org/hyperscan) -- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R., - & Weimann, O. (2011). - _Optimal packed string matching_. - In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13). - Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik. - DOI: 10.4230/LIPIcs.FSTTCS.2011.423. - [PDF](http://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf). -- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R., - & Weimann, O. (2014). - _Towards optimal packed string matching_. - Theoretical Computer Science, 525, 111-129. - DOI: 10.1016/j.tcs.2013.06.013. - [PDF](http://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf). -- **[3]** Bille, P. (2011). - _Fast searching in packed strings_. - Journal of Discrete Algorithms, 9(1), 49-56. - DOI: 10.1016/j.jda.2010.09.003. - [PDF](http://www.sciencedirect.com/science/article/pii/S1570866710000353). -- **[4a]** Faro, S., & Külekci, M. O. (2012, October). - _Fast multiple string matching using streaming SIMD extensions technology_. - In String Processing and Information Retrieval (pp. 217-228). - Springer Berlin Heidelberg. - DOI: 10.1007/978-3-642-34109-0_23. - [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro32.pdf). -- **[4b]** Faro, S., & Külekci, M. O. (2013, September). - _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_. - In Stringology (pp. 78-91). - [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro36.pdf). -- **[4c]** Faro, S., & Külekci, M. O. (2013, January). - _Fast packed string matching for short patterns_. - In Proceedings of the Meeting on Algorithm Engineering & Expermiments - (pp. 113-121). - Society for Industrial and Applied Mathematics. - [PDF](http://arxiv.org/pdf/1209.6449.pdf). -- **[4d]** Faro, S., & Külekci, M. O. (2014). - _Fast and flexible packed string matching_. - Journal of Discrete Algorithms, 28, 61-72. - DOI: 10.1016/j.jda.2014.07.003. - -[1_u]: https://github.com/01org/hyperscan -[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide -*/ - -use std::cmp; - -use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder}; -use syntax::hir::literal::Literals; - -use vector::ssse3::{u8x16, SSSE3VectorBuilder}; - -/// Corresponds to the number of bytes read at a time in the haystack. -const BLOCK_SIZE: usize = 16; - -/// Match reports match information. -#[derive(Debug, Clone)] -pub struct Match { - /// The index of the pattern that matched. The index is in correspondence - /// with the order of the patterns given at construction. - pub pat: usize, - /// The start byte offset of the match. - pub start: usize, - /// The end byte offset of the match. This is always `start + pat.len()`. - pub end: usize, -} - -/// A SIMD accelerated multi substring searcher. -#[derive(Debug, Clone)] -pub struct Teddy { - /// A builder for SSSE3 empowered vectors. - vb: SSSE3VectorBuilder, - /// A list of substrings to match. - pats: Vec>, - /// An Aho-Corasick automaton of the patterns. 
We use this when we need to - /// search pieces smaller than the Teddy block size. - ac: AhoCorasick, - /// A set of 8 buckets. Each bucket corresponds to a single member of a - /// bitset. A bucket contains zero or more substrings. This is useful - /// when the number of substrings exceeds 8, since our bitsets cannot have - /// more than 8 members. - buckets: Vec>, - /// Our set of masks. There's one mask for each byte in the fingerprint. - masks: Masks, -} - -impl Teddy { - /// Returns true if and only if Teddy is supported on this platform. - /// - /// If this returns `false`, then `Teddy::new(...)` is guaranteed to - /// return `None`. - pub fn available() -> bool { - SSSE3VectorBuilder::new().is_some() - } - - /// Create a new `Teddy` multi substring matcher. - /// - /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has - /// an empty substring), then `None` is returned. - pub fn new(pats: &Literals) -> Option { - let vb = match SSSE3VectorBuilder::new() { - None => return None, - Some(vb) => vb, - }; - if !Teddy::available() { - return None; - } - - let pats: Vec<_> = - pats.literals().iter().map(|p| p.to_vec()).collect(); - let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0); - // Don't allow any empty patterns and require that we have at - // least one pattern. - if min_len < 1 { - return None; - } - // Pick the largest mask possible, but no larger than 3. - let nmasks = cmp::min(3, min_len); - let mut masks = Masks::new(vb, nmasks); - let mut buckets = vec![vec![]; 8]; - // Assign a substring to each bucket, and add the bucket's bitfield to - // the appropriate position in the mask. - for (pati, pat) in pats.iter().enumerate() { - let bucket = pati % 8; - buckets[bucket].push(pati); - masks.add(bucket as u8, pat); - } - let ac = AhoCorasickBuilder::new() - .match_kind(aho_corasick::MatchKind::LeftmostFirst) - .dfa(true) - .prefilter(false) - .build(&pats); - Some(Teddy { - vb: vb, - pats: pats.to_vec(), - ac: ac, - buckets: buckets, - masks: masks, - }) - } - - /// Returns all of the substrings matched by this `Teddy`. - pub fn patterns(&self) -> &[Vec] { - &self.pats - } - - /// Returns the number of substrings in this matcher. - pub fn len(&self) -> usize { - self.pats.len() - } - - /// Returns the approximate size on the heap used by this matcher. - pub fn approximate_size(&self) -> usize { - self.pats.iter().fold(0, |a, b| a + b.len()) - } - - /// Searches `haystack` for the substrings in this `Teddy`. If a match was - /// found, then it is returned. Otherwise, `None` is returned. - pub fn find(&self, haystack: &[u8]) -> Option { - // This is safe because the only way we can construct a Teddy type - // is if SSSE3 is available. - unsafe { self.find_impl(haystack) } - } - - #[allow(unused_attributes)] - #[target_feature(enable = "ssse3")] - unsafe fn find_impl(&self, haystack: &[u8]) -> Option { - // If our haystack is smaller than the block size, then fall back to - // a naive brute force search. - if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) { - return self.slow(haystack, 0); - } - match self.masks.len() { - 0 => None, - 1 => self.find1(haystack), - 2 => self.find2(haystack), - 3 => self.find3(haystack), - _ => unreachable!(), - } - } - - /// `find1` is used when there is only 1 mask. This is the easy case and is - /// pretty much as described in the module documentation. 
- #[inline(always)] - fn find1(&self, haystack: &[u8]) -> Option { - let mut pos = 0; - let zero = self.vb.u8x16_splat(0); - let len = haystack.len(); - debug_assert!(len >= BLOCK_SIZE); - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-16. - let p = haystack.get_unchecked(pos..); - self.vb.u8x16_load_unchecked_unaligned(p) - }; - // N.B. `res0` is our `C` in the module documentation. - let res0 = self.masks.members1(h); - // Only do expensive verification if there are any non-zero bits. - let bitfield = res0.ne(zero).movemask(); - if bitfield != 0 { - if let Some(m) = self.verify(haystack, pos, res0, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - self.slow(haystack, pos) - } - - /// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes - /// long. - #[inline(always)] - fn find2(&self, haystack: &[u8]) -> Option { - // This is an exotic way to right shift a SIMD vector across lanes. - // See below at use for more details. - let zero = self.vb.u8x16_splat(0); - let len = haystack.len(); - // The previous value of `C` (from the module documentation) for the - // *first* byte in the fingerprint. On subsequent iterations, we take - // the last bitset from the previous `C` and insert it into the first - // position of the current `C`, shifting all other bitsets to the right - // one lane. This causes `C` for the first byte to line up with `C` for - // the second byte, so that they can be `AND`'d together. - let mut prev0 = self.vb.u8x16_splat(0xFF); - let mut pos = 1; - debug_assert!(len >= BLOCK_SIZE); - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-16. - let p = haystack.get_unchecked(pos..); - self.vb.u8x16_load_unchecked_unaligned(p) - }; - let (res0, res1) = self.masks.members2(h); - - // Do this: - // - // (prev0 << 15) | (res0 >> 1) - // - // This lets us line up our C values for each byte. - let res0prev0 = res0.alignr_15(prev0); - - // `AND`'s our `C` values together. - let res = res0prev0.and(res1); - prev0 = res0; - - let bitfield = res.ne(zero).movemask(); - if bitfield != 0 { - let pos = pos.checked_sub(1).unwrap(); - if let Some(m) = self.verify(haystack, pos, res, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - // The windowing above doesn't check the last byte in the last - // window, so start the slow search at the last byte of the last - // window. - self.slow(haystack, pos.checked_sub(1).unwrap()) - } - - /// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes - /// long. - /// - /// N.B. This is a straight-forward extrapolation of `find2`. The only - /// difference is that we need to keep track of two previous values of `C`, - /// since we now need to align for three bytes. - #[inline(always)] - fn find3(&self, haystack: &[u8]) -> Option { - let zero = self.vb.u8x16_splat(0); - let len = haystack.len(); - let mut prev0 = self.vb.u8x16_splat(0xFF); - let mut prev1 = self.vb.u8x16_splat(0xFF); - let mut pos = 2; - while pos <= len - BLOCK_SIZE { - let h = unsafe { - // I tried and failed to eliminate bounds checks in safe code. - // This is safe because of our loop invariant: pos is always - // <= len-16. 
- let p = haystack.get_unchecked(pos..); - self.vb.u8x16_load_unchecked_unaligned(p) - }; - let (res0, res1, res2) = self.masks.members3(h); - - let res0prev0 = res0.alignr_14(prev0); - let res1prev1 = res1.alignr_15(prev1); - let res = res0prev0.and(res1prev1).and(res2); - - prev0 = res0; - prev1 = res1; - - let bitfield = res.ne(zero).movemask(); - if bitfield != 0 { - let pos = pos.checked_sub(2).unwrap(); - if let Some(m) = self.verify(haystack, pos, res, bitfield) { - return Some(m); - } - } - pos += BLOCK_SIZE; - } - // The windowing above doesn't check the last two bytes in the last - // window, so start the slow search at the penultimate byte of the - // last window. - // self.slow(haystack, pos.saturating_sub(2)) - self.slow(haystack, pos.checked_sub(2).unwrap()) - } - - /// Runs the verification procedure on `res` (i.e., `C` from the module - /// documentation), where the haystack block starts at `pos` in - /// `haystack`. `bitfield` has ones in the bit positions that `res` has - /// non-zero bytes. - /// - /// If a match exists, it returns the first one. - #[inline(always)] - fn verify( - &self, - haystack: &[u8], - pos: usize, - res: u8x16, - mut bitfield: u32, - ) -> Option { - let patterns = res.bytes(); - while bitfield != 0 { - // The next offset, relative to pos, where some fingerprint - // matched. - let byte_pos = bitfield.trailing_zeros() as usize; - bitfield &= !(1 << byte_pos); - - // Offset relative to the beginning of the haystack. - let start = pos + byte_pos; - - // The bitfield telling us which patterns had fingerprints that - // match at this starting position. - let mut patterns = patterns[byte_pos]; - while patterns != 0 { - let bucket = patterns.trailing_zeros() as usize; - patterns &= !(1 << bucket); - - // Actual substring search verification. - if let Some(m) = self.verify_bucket(haystack, bucket, start) { - return Some(m); - } - } - } - - None - } - - /// Verifies whether any substring in the given bucket matches in haystack - /// at the given starting position. - #[inline(always)] - fn verify_bucket( - &self, - haystack: &[u8], - bucket: usize, - start: usize, - ) -> Option { - // This cycles through the patterns in the bucket in the order that - // the patterns were given. Therefore, we guarantee leftmost-first - // semantics. - for &pati in &self.buckets[bucket] { - let pat = &*self.pats[pati]; - if start + pat.len() > haystack.len() { - continue; - } - if pat == &haystack[start..start + pat.len()] { - return Some(Match { - pat: pati, - start: start, - end: start + pat.len(), - }); - } - } - None - } - - /// Slow substring search through all patterns in this matcher. - /// - /// This is used when we don't have enough bytes in the haystack for our - /// block based approach. - #[inline(never)] - fn slow(&self, haystack: &[u8], pos: usize) -> Option { - self.ac.find(&haystack[pos..]).map(|m| Match { - pat: m.pattern(), - start: pos + m.start(), - end: pos + m.end(), - }) - } -} - -/// A list of masks. This has length equal to the length of the fingerprint. -/// The length of the fingerprint is always `min(3, len(smallest_substring))`. -#[derive(Debug, Clone)] -struct Masks { - vb: SSSE3VectorBuilder, - masks: [Mask; 3], - size: usize, -} - -impl Masks { - /// Create a new set of masks of size `n`, where `n` corresponds to the - /// number of bytes in a fingerprint. - fn new(vb: SSSE3VectorBuilder, n: usize) -> Masks { - Masks { - vb: vb, - masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)], - size: n, - } - } - - /// Returns the number of masks. 
- fn len(&self) -> usize { - self.size - } - - /// Adds the given pattern to the given bucket. The bucket should be a - /// power of `2 <= 2^7`. - fn add(&mut self, bucket: u8, pat: &[u8]) { - for i in 0..self.len() { - self.masks[i].add(bucket, pat[i]); - } - } - - /// Finds the fingerprints that are in the given haystack block. i.e., this - /// returns `C` as described in the module documentation. - /// - /// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and - /// only if `haystack_block[i]` corresponds to a fingerprint that is part - /// of a pattern in bucket `j`. - #[inline(always)] - fn members1(&self, haystack_block: u8x16) -> u8x16 { - let masklo = self.vb.u8x16_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)) - } - - /// Like members1, but computes C for the first and second bytes in the - /// fingerprint. - #[inline(always)] - fn members2(&self, haystack_block: u8x16) -> (u8x16, u8x16) { - let masklo = self.vb.u8x16_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - let res0 = - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); - let res1 = - self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); - (res0, res1) - } - - /// Like `members1`, but computes `C` for the first, second and third bytes - /// in the fingerprint. - #[inline(always)] - fn members3(&self, haystack_block: u8x16) -> (u8x16, u8x16, u8x16) { - let masklo = self.vb.u8x16_splat(0xF); - let hlo = haystack_block.and(masklo); - let hhi = haystack_block.bit_shift_right_4().and(masklo); - - let res0 = - self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi)); - let res1 = - self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi)); - let res2 = - self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi)); - (res0, res1, res2) - } -} - -/// A single mask. -#[derive(Debug, Clone, Copy)] -struct Mask { - /// Bitsets for the low nybbles in a fingerprint. - lo: u8x16, - /// Bitsets for the high nybbles in a fingerprint. - hi: u8x16, -} - -impl Mask { - /// Create a new mask with no members. - fn new(vb: SSSE3VectorBuilder) -> Mask { - Mask { lo: vb.u8x16_splat(0), hi: vb.u8x16_splat(0) } - } - - /// Adds the given byte to the given bucket. - fn add(&mut self, bucket: u8, byte: u8) { - // Split our byte into two nybbles, and add each nybble to our - // mask. 
- let byte_lo = (byte & 0xF) as usize; - let byte_hi = (byte >> 4) as usize; - - { - let mut lo_bytes = self.lo.bytes(); - let lo = lo_bytes[byte_lo]; - lo_bytes[byte_lo] = ((1 << bucket) as u8) | lo; - self.lo.replace_bytes(lo_bytes); - } - { - let mut hi_bytes = self.hi.bytes(); - let hi = hi_bytes[byte_hi]; - hi_bytes[byte_hi] = ((1 << bucket) as u8) | hi; - self.hi.replace_bytes(hi_bytes); - } - } -} diff --git a/src/literal/teddy_ssse3/mod.rs b/src/literal/teddy_ssse3/mod.rs deleted file mode 100644 index 6930ed5028..0000000000 --- a/src/literal/teddy_ssse3/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub use self::imp::*; - -#[cfg(target_arch = "x86_64")] -mod imp; - -#[cfg(not(target_arch = "x86_64"))] -#[path = "fallback.rs"] -mod imp; diff --git a/src/vector/avx2.rs b/src/vector/avx2.rs deleted file mode 100644 index 5228c60d1c..0000000000 --- a/src/vector/avx2.rs +++ /dev/null @@ -1,183 +0,0 @@ -#![allow(dead_code)] - -use std::arch::x86_64::*; -use std::fmt; -use std::mem; - -#[derive(Clone, Copy, Debug)] -pub struct AVX2VectorBuilder(()); - -impl AVX2VectorBuilder { - pub fn new() -> Option { - if is_x86_feature_detected!("avx2") { - Some(AVX2VectorBuilder(())) - } else { - None - } - } - - /// Create a new u8x32 AVX2 vector where all of the bytes are set to - /// the given value. - #[inline] - pub fn u8x32_splat(self, n: u8) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { u8x32::splat(n) } - } - - /// Load 32 bytes from the given slice, with bounds checks. - #[inline] - pub fn u8x32_load_unaligned(self, slice: &[u8]) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { u8x32::load_unaligned(slice) } - } - - /// Load 32 bytes from the given slice, without bounds checks. - #[inline] - pub unsafe fn u8x32_load_unchecked_unaligned(self, slice: &[u8]) -> u8x32 { - // Safe because we know AVX2 is enabled, but still unsafe - // because we aren't doing bounds checks. - u8x32::load_unchecked_unaligned(slice) - } - - /// Load 32 bytes from the given slice, with bound and alignment checks. - #[inline] - pub fn u8x32_load(self, slice: &[u8]) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { u8x32::load(slice) } - } - - /// Load 32 bytes from the given slice, without bound or alignment checks. - #[inline] - pub unsafe fn u8x32_load_unchecked(self, slice: &[u8]) -> u8x32 { - // Safe because we know AVX2 is enabled, but still unsafe - // because we aren't doing bounds checks. - u8x32::load_unchecked(slice) - } -} - -#[derive(Clone, Copy)] -#[allow(non_camel_case_types)] -#[repr(transparent)] -pub struct u8x32 { - vector: __m256i, -} - -impl u8x32 { - #[inline] - unsafe fn splat(n: u8) -> u8x32 { - u8x32 { vector: _mm256_set1_epi8(n as i8) } - } - - #[inline] - unsafe fn load_unaligned(slice: &[u8]) -> u8x32 { - assert!(slice.len() >= 32); - u8x32::load_unchecked_unaligned(slice) - } - - #[inline] - unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x32 { - let p = slice.as_ptr() as *const u8 as *const __m256i; - u8x32 { vector: _mm256_loadu_si256(p) } - } - - #[inline] - unsafe fn load(slice: &[u8]) -> u8x32 { - assert!(slice.len() >= 32); - assert!(slice.as_ptr() as usize % 32 == 0); - u8x32::load_unchecked(slice) - } - - #[inline] - unsafe fn load_unchecked(slice: &[u8]) -> u8x32 { - let p = slice.as_ptr() as *const u8 as *const __m256i; - u8x32 { vector: _mm256_load_si256(p) } - } - - #[inline] - pub fn shuffle(self, indices: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. 
- unsafe { - u8x32 { vector: _mm256_shuffle_epi8(self.vector, indices.vector) } - } - } - - #[inline] - pub fn ne(self, other: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { - let boolv = _mm256_cmpeq_epi8(self.vector, other.vector); - let ones = _mm256_set1_epi8(0xFF as u8 as i8); - u8x32 { vector: _mm256_andnot_si256(boolv, ones) } - } - } - - #[inline] - pub fn and(self, other: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { - u8x32 { vector: _mm256_and_si256(self.vector, other.vector) } - } - } - - #[inline] - pub fn movemask(self) -> u32 { - // Safe because we know AVX2 is enabled. - unsafe { _mm256_movemask_epi8(self.vector) as u32 } - } - - #[inline] - pub fn alignr_14(self, other: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { - // Credit goes to jneem for figuring this out: - // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 - // - // TL;DR avx2's PALIGNR instruction is actually just two 128-bit - // PALIGNR instructions, which is not what we want, so we need to - // do some extra shuffling. - let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21); - let v = _mm256_alignr_epi8(self.vector, v, 14); - u8x32 { vector: v } - } - } - - #[inline] - pub fn alignr_15(self, other: u8x32) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { - // Credit goes to jneem for figuring this out: - // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 - // - // TL;DR avx2's PALIGNR instruction is actually just two 128-bit - // PALIGNR instructions, which is not what we want, so we need to - // do some extra shuffling. - let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21); - let v = _mm256_alignr_epi8(self.vector, v, 15); - u8x32 { vector: v } - } - } - - #[inline] - pub fn bit_shift_right_4(self) -> u8x32 { - // Safe because we know AVX2 is enabled. - unsafe { u8x32 { vector: _mm256_srli_epi16(self.vector, 4) } } - } - - #[inline] - pub fn bytes(self) -> [u8; 32] { - // Safe because __m256i and [u8; 32] are layout compatible - unsafe { mem::transmute(self) } - } - - #[inline] - pub fn replace_bytes(&mut self, value: [u8; 32]) { - // Safe because __m256i and [u8; 32] are layout compatible - self.vector = unsafe { mem::transmute(value) }; - } -} - -impl fmt::Debug for u8x32 { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.bytes().fmt(f) - } -} diff --git a/src/vector/mod.rs b/src/vector/mod.rs deleted file mode 100644 index 1dbae19a09..0000000000 --- a/src/vector/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -#[cfg(target_arch = "x86_64")] -pub mod avx2; -#[cfg(target_arch = "x86_64")] -pub mod ssse3; diff --git a/src/vector/ssse3.rs b/src/vector/ssse3.rs deleted file mode 100644 index 99e99ab87d..0000000000 --- a/src/vector/ssse3.rs +++ /dev/null @@ -1,186 +0,0 @@ -#![allow(dead_code)] - -use std::arch::x86_64::*; -use std::fmt; -use std::mem; - -/// A builder for SSSE3 empowered vectors. -/// -/// This builder represents a receipt that the SSSE3 target feature is enabled -/// on the currently running CPU. Namely, the only way to get a value of this -/// type is if the SSSE3 feature is enabled. -/// -/// This type can then be used to build vector types that use SSSE3 features -/// safely. -#[derive(Clone, Copy, Debug)] -pub struct SSSE3VectorBuilder(()); - -impl SSSE3VectorBuilder { - /// Create a new SSSE3 vector builder. 
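// Illustrative sketch (not part of this patch): what the AVX2 `alignr_14`
// above computes once the per-lane PALIGNR behavior is fixed up with
// `_mm256_permute2x128_si256`. With `prev` standing in for `other` and `cur`
// for `self`, the result is the last 2 bytes of `prev` followed by the first
// 30 bytes of `cur`; `alignr_15` keeps only the last byte of `prev`.
fn alignr_14_model(cur: &[u8; 32], prev: &[u8; 32]) -> [u8; 32] {
    // Equivalent to taking the 32-byte window of (prev ++ cur) that starts
    // two bytes before the current block.
    let mut out = [0u8; 32];
    out[..2].copy_from_slice(&prev[30..]);
    out[2..].copy_from_slice(&cur[..30]);
    out
}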
- ///
- /// If the SSSE3 feature is not enabled for the current target, then
- /// return `None`.
- pub fn new() -> Option<SSSE3VectorBuilder> {
- if is_x86_feature_detected!("ssse3") {
- Some(SSSE3VectorBuilder(()))
- } else {
- None
- }
- }
-
- /// Create a new u8x16 SSSE3 vector where all of the bytes are set to
- /// the given value.
- #[inline]
- pub fn u8x16_splat(self, n: u8) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe { u8x16::splat(n) }
- }
-
- /// Load 16 bytes from the given slice, with bounds checks.
- #[inline]
- pub fn u8x16_load_unaligned(self, slice: &[u8]) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe { u8x16::load_unaligned(slice) }
- }
-
- /// Load 16 bytes from the given slice, without bounds checks.
- #[inline]
- pub unsafe fn u8x16_load_unchecked_unaligned(self, slice: &[u8]) -> u8x16 {
- // Safe because we know SSSE3 is enabled, but still unsafe
- // because we aren't doing bounds checks.
- u8x16::load_unchecked_unaligned(slice)
- }
-
- /// Load 16 bytes from the given slice, with bound and alignment checks.
- #[inline]
- pub fn u8x16_load(self, slice: &[u8]) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe { u8x16::load(slice) }
- }
-
- /// Load 16 bytes from the given slice, without bound or alignment checks.
- #[inline]
- pub unsafe fn u8x16_load_unchecked(self, slice: &[u8]) -> u8x16 {
- // Safe because we know SSSE3 is enabled, but still unsafe
- // because we aren't doing bounds checks.
- u8x16::load_unchecked(slice)
- }
-}
-
-/// A u8x16 is a 128-bit vector with 16 single-byte lanes.
-///
-/// It provides a safe API that uses only SSE2 or SSSE3 instructions.
-/// The only way for callers to construct a value of this type is
-/// through the SSSE3VectorBuilder type, and the only way to get a
-/// SSSE3VectorBuilder is if the `ssse3` target feature is enabled.
-///
-/// Note that generally speaking, all uses of this type should get
-/// inlined, otherwise you probably have a performance bug.
-#[derive(Clone, Copy)]
-#[allow(non_camel_case_types)]
-#[repr(transparent)]
-pub struct u8x16 {
- vector: __m128i,
-}
-
-impl u8x16 {
- #[inline]
- unsafe fn splat(n: u8) -> u8x16 {
- u8x16 { vector: _mm_set1_epi8(n as i8) }
- }
-
- #[inline]
- unsafe fn load_unaligned(slice: &[u8]) -> u8x16 {
- assert!(slice.len() >= 16);
- u8x16::load_unchecked(slice)
- }
-
- #[inline]
- unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x16 {
- let v = _mm_loadu_si128(slice.as_ptr() as *const u8 as *const __m128i);
- u8x16 { vector: v }
- }
-
- #[inline]
- unsafe fn load(slice: &[u8]) -> u8x16 {
- assert!(slice.len() >= 16);
- assert!(slice.as_ptr() as usize % 16 == 0);
- u8x16::load_unchecked(slice)
- }
-
- #[inline]
- unsafe fn load_unchecked(slice: &[u8]) -> u8x16 {
- let v = _mm_load_si128(slice.as_ptr() as *const u8 as *const __m128i);
- u8x16 { vector: v }
- }
-
- #[inline]
- pub fn shuffle(self, indices: u8x16) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe {
- u8x16 { vector: _mm_shuffle_epi8(self.vector, indices.vector) }
- }
- }
-
- #[inline]
- pub fn ne(self, other: u8x16) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
- unsafe {
- let boolv = _mm_cmpeq_epi8(self.vector, other.vector);
- let ones = _mm_set1_epi8(0xFF as u8 as i8);
- u8x16 { vector: _mm_andnot_si128(boolv, ones) }
- }
- }
-
- #[inline]
- pub fn and(self, other: u8x16) -> u8x16 {
- // Safe because we know SSSE3 is enabled.
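// Illustrative sketch (not part of this patch): the byte-level semantics of
// `ne` above. `_mm_cmpeq_epi8` produces 0xFF for equal lanes and 0x00
// otherwise, and the ANDNOT against an all-ones vector flips that into a
// "not equal" mask.
fn ne_model(a: &[u8; 16], b: &[u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = if a[i] == b[i] { 0x00 } else { 0xFF };
    }
    out
}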
- unsafe { u8x16 { vector: _mm_and_si128(self.vector, other.vector) } } - } - - #[inline] - pub fn movemask(self) -> u32 { - // Safe because we know SSSE3 is enabled. - unsafe { _mm_movemask_epi8(self.vector) as u32 } - } - - #[inline] - pub fn alignr_14(self, other: u8x16) -> u8x16 { - // Safe because we know SSSE3 is enabled. - unsafe { - u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 14) } - } - } - - #[inline] - pub fn alignr_15(self, other: u8x16) -> u8x16 { - // Safe because we know SSSE3 is enabled. - unsafe { - u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 15) } - } - } - - #[inline] - pub fn bit_shift_right_4(self) -> u8x16 { - // Safe because we know SSSE3 is enabled. - unsafe { u8x16 { vector: _mm_srli_epi16(self.vector, 4) } } - } - - #[inline] - pub fn bytes(self) -> [u8; 16] { - // Safe because __m128i and [u8; 16] are layout compatible - unsafe { mem::transmute(self) } - } - - #[inline] - pub fn replace_bytes(&mut self, value: [u8; 16]) { - // Safe because __m128i and [u8; 16] are layout compatible - self.vector = unsafe { mem::transmute(value) }; - } -} - -impl fmt::Debug for u8x16 { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.bytes().fmt(f) - } -}
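For reference, the step these vector types feed into is candidate extraction via
`movemask`: it packs the most significant bit of every byte lane into an integer,
so a lane holding 0xFF (e.g. from a comparison) contributes a set bit at its
offset. A minimal sketch of walking those candidate offsets, using illustrative
names and nothing from the deleted modules, looks like this:

fn candidate_offsets(mut mask: u32) -> Vec<usize> {
    // Each set bit marks a byte lane whose comparison produced 0xFF.
    let mut offsets = Vec::new();
    while mask != 0 {
        // Index of the lowest set bit, i.e. the leftmost remaining candidate.
        offsets.push(mask.trailing_zeros() as usize);
        // Clear that bit and keep scanning.
        mask &= mask - 1;
    }
    offsets
}

Offsets recovered this way are only candidates: nybble fingerprints can collide,
so each one still has to be verified against the actual literals before a match
is reported.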