From b890b7bbc732cd26f13c309573b5a3e45d0748de Mon Sep 17 00:00:00 2001 From: Ulrik Sverdrup Date: Sun, 14 Jun 2015 23:17:17 +0200 Subject: [PATCH 1/4] StrSearcher: Update substring search to use the Two Way algorithm To improve our substring search performance, revive the two way searcher and adapt it to the Pattern API. Fixes #25483, a performance bug: that particular case now completes faster in optimized rust than in ruby (but they share the same order of magnitude). Much thanks to @gereeter who helped me understand the reverse case better and wrote the comment explaining `next_back` in the code. I had quickcheck to fuzz test forward and reverse searching thoroughly. The two way searcher implements both forward and reverse search, but not double ended search. The forward and reverse parts of the two way searcher are completely independent. The two way searcher algorithm has very small, constant space overhead, requiring no dynamic allocation. Our implementation is relatively fast, especially due to the `byteset` addition to the algorithm, which speeds up many no-match cases. A bad case for the two way algorithm is: ``` let haystack = (0..10_000).map(|_| "dac").collect::(); let needle = (0..100).map(|_| "bac").collect::()); ``` For this particular case, two way is not much faster than the naive implementation it replaces. --- src/libcollectionstest/str.rs | 10 +- src/libcore/str/mod.rs | 299 +---------------- src/libcore/str/pattern.rs | 592 ++++++++++++++++++++++++++-------- 3 files changed, 475 insertions(+), 426 deletions(-) diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 3f32136bc2639..87a018ced195a 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -705,7 +705,7 @@ fn test_split_at() { #[should_panic] fn test_split_at_boundscheck() { let s = "ศไทย中华Việt Nam"; - let (a, b) = s.split_at(1); + s.split_at(1); } #[test] @@ -1820,6 +1820,14 @@ mod pattern { Match (4, 6), Reject(6, 7), ]); + make_test!(str_searcher_ascii_haystack_seq, "bb", "abbcbbbbd", [ + Reject(0, 1), + Match (1, 3), + Reject(3, 4), + Match (4, 6), + Match (6, 8), + Reject(8, 9), + ]); make_test!(str_searcher_empty_needle_ascii_haystack, "", "abbcbbd", [ Match (0, 0), Reject(0, 1), diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 5a621176c4a81..bd6e1a4063abd 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -15,13 +15,12 @@ #![doc(primitive = "str")] #![stable(feature = "rust1", since = "1.0.0")] -use self::OldSearcher::{TwoWay, TwoWayLong}; use self::pattern::Pattern; use self::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher}; use char::CharExt; use clone::Clone; -use cmp::{self, Eq}; +use cmp::Eq; use convert::AsRef; use default::Default; use fmt; @@ -33,7 +32,6 @@ use option::Option::{self, None, Some}; use raw::{Repr, Slice}; use result::Result::{self, Ok, Err}; use slice::{self, SliceExt}; -use usize; pub mod pattern; @@ -870,301 +868,6 @@ impl<'a> DoubleEndedIterator for LinesAny<'a> { } } -/// The internal state of an iterator that searches for matches of a substring -/// within a larger string using two-way search -#[derive(Clone)] -struct TwoWaySearcher { - // constants - crit_pos: usize, - period: usize, - byteset: u64, - - // variables - position: usize, - memory: usize -} - -/* - This is the Two-Way search algorithm, which was introduced in the paper: - Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. - - Here's some background information. - - A *word* is a string of symbols. The *length* of a word should be a familiar - notion, and here we denote it for any word x by |x|. - (We also allow for the possibility of the *empty word*, a word of length zero). - - If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a - *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. - For example, both 1 and 2 are periods for the string "aa". As another example, - the only period of the string "abcd" is 4. - - We denote by period(x) the *smallest* period of x (provided that x is non-empty). - This is always well-defined since every non-empty word x has at least one period, - |x|. We sometimes call this *the period* of x. - - If u, v and x are words such that x = uv, where uv is the concatenation of u and - v, then we say that (u, v) is a *factorization* of x. - - Let (u, v) be a factorization for a word x. Then if w is a non-empty word such - that both of the following hold - - - either w is a suffix of u or u is a suffix of w - - either w is a prefix of v or v is a prefix of w - - then w is said to be a *repetition* for the factorization (u, v). - - Just to unpack this, there are four possibilities here. Let w = "abc". Then we - might have: - - - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") - - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") - - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") - - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") - - Note that the word vu is a repetition for any factorization (u,v) of x = uv, - so every factorization has at least one repetition. - - If x is a string and (u, v) is a factorization for x, then a *local period* for - (u, v) is an integer r such that there is some word w such that |w| = r and w is - a repetition for (u, v). - - We denote by local_period(u, v) the smallest local period of (u, v). We sometimes - call this *the local period* of (u, v). Provided that x = uv is non-empty, this - is well-defined (because each non-empty word has at least one factorization, as - noted above). - - It can be proven that the following is an equivalent definition of a local period - for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for - all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are - defined. (i.e. i > 0 and i + r < |x|). - - Using the above reformulation, it is easy to prove that - - 1 <= local_period(u, v) <= period(uv) - - A factorization (u, v) of x such that local_period(u,v) = period(x) is called a - *critical factorization*. - - The algorithm hinges on the following theorem, which is stated without proof: - - **Critical Factorization Theorem** Any word x has at least one critical - factorization (u, v) such that |u| < period(x). - - The purpose of maximal_suffix is to find such a critical factorization. - -*/ -impl TwoWaySearcher { - #[allow(dead_code)] - fn new(needle: &[u8]) -> TwoWaySearcher { - let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); - let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); - - let (crit_pos, period) = - if crit_pos_false > crit_pos_true { - (crit_pos_false, period_false) - } else { - (crit_pos_true, period_true) - }; - - // This isn't in the original algorithm, as far as I'm aware. - let byteset = needle.iter() - .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a); - - // A particularly readable explanation of what's going on here can be found - // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically - // see the code for "Algorithm CP" on p. 323. - // - // What's going on is we have some critical factorization (u, v) of the - // needle, and we want to determine whether u is a suffix of - // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use - // "Algorithm CP2", which is optimized for when the period of the needle - // is large. - if &needle[..crit_pos] == &needle[period.. period + crit_pos] { - TwoWaySearcher { - crit_pos: crit_pos, - period: period, - byteset: byteset, - - position: 0, - memory: 0 - } - } else { - TwoWaySearcher { - crit_pos: crit_pos, - period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, - byteset: byteset, - - position: 0, - memory: usize::MAX // Dummy value to signify that the period is long - } - } - } - - // One of the main ideas of Two-Way is that we factorize the needle into - // two halves, (u, v), and begin trying to find v in the haystack by scanning - // left to right. If v matches, we try to match u by scanning right to left. - // How far we can jump when we encounter a mismatch is all based on the fact - // that (u, v) is a critical factorization for the needle. - #[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) - -> Option<(usize, usize)> { - 'search: loop { - // Check that we have room to search in - if self.position + needle.len() > haystack.len() { - return None; - } - - // Quickly skip by large portions unrelated to our substring - if (self.byteset >> - ((haystack[self.position + needle.len() - 1] & 0x3f) - as usize)) & 1 == 0 { - self.position += needle.len(); - if !long_period { - self.memory = 0; - } - continue 'search; - } - - // See if the right part of the needle matches - let start = if long_period { self.crit_pos } - else { cmp::max(self.crit_pos, self.memory) }; - for i in start..needle.len() { - if needle[i] != haystack[self.position + i] { - self.position += i - self.crit_pos + 1; - if !long_period { - self.memory = 0; - } - continue 'search; - } - } - - // See if the left part of the needle matches - let start = if long_period { 0 } else { self.memory }; - for i in (start..self.crit_pos).rev() { - if needle[i] != haystack[self.position + i] { - self.position += self.period; - if !long_period { - self.memory = needle.len() - self.period; - } - continue 'search; - } - } - - // We have found a match! - let match_pos = self.position; - self.position += needle.len(); // add self.period for all matches - if !long_period { - self.memory = 0; // set to needle.len() - self.period for all matches - } - return Some((match_pos, match_pos + needle.len())); - } - } - - // Computes a critical factorization (u, v) of `arr`. - // Specifically, returns (i, p), where i is the starting index of v in some - // critical factorization (u, v) and p = period(v) - #[inline] - #[allow(dead_code)] - #[allow(deprecated)] - fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) { - let mut left: usize = !0; // Corresponds to i in the paper - let mut right = 0; // Corresponds to j in the paper - let mut offset = 1; // Corresponds to k in the paper - let mut period = 1; // Corresponds to p in the paper - - while right + offset < arr.len() { - let a; - let b; - if reversed { - a = arr[left.wrapping_add(offset)]; - b = arr[right + offset]; - } else { - a = arr[right + offset]; - b = arr[left.wrapping_add(offset)]; - } - if a < b { - // Suffix is smaller, period is entire prefix so far. - right += offset; - offset = 1; - period = right.wrapping_sub(left); - } else if a == b { - // Advance through repetition of the current period. - if offset == period { - right += offset; - offset = 1; - } else { - offset += 1; - } - } else { - // Suffix is larger, start over from current location. - left = right; - right += 1; - offset = 1; - period = 1; - } - } - (left.wrapping_add(1), period) - } -} - -/// The internal state of an iterator that searches for matches of a substring -/// within a larger string using a dynamically chosen search algorithm -#[derive(Clone)] -// NB: This is kept around for convenience because -// it is planned to be used again in the future -enum OldSearcher { - TwoWay(TwoWaySearcher), - TwoWayLong(TwoWaySearcher), -} - -impl OldSearcher { - #[allow(dead_code)] - fn new(haystack: &[u8], needle: &[u8]) -> OldSearcher { - if needle.is_empty() { - // Handle specially - unimplemented!() - // FIXME: Tune this. - // FIXME(#16715): This unsigned integer addition will probably not - // overflow because that would mean that the memory almost solely - // consists of the needle. Needs #16715 to be formally fixed. - } else if needle.len() + 20 > haystack.len() { - // Use naive searcher - unimplemented!() - } else { - let searcher = TwoWaySearcher::new(needle); - if searcher.memory == usize::MAX { // If the period is long - TwoWayLong(searcher) - } else { - TwoWay(searcher) - } - } - } -} - -#[derive(Clone)] -// NB: This is kept around for convenience because -// it is planned to be used again in the future -struct OldMatchIndices<'a, 'b> { - // constants - haystack: &'a str, - needle: &'b str, - searcher: OldSearcher -} - -impl<'a, 'b> OldMatchIndices<'a, 'b> { - #[inline] - #[allow(dead_code)] - fn next(&mut self) -> Option<(usize, usize)> { - match self.searcher { - TwoWay(ref mut searcher) - => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false), - TwoWayLong(ref mut searcher) - => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true), - } - } -} - /* Section: Comparing strings */ diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 8bdbab55211d8..de40a57fe9c17 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -17,6 +17,8 @@ reason = "API not fully fleshed out and ready to be stabilized")] use prelude::*; +use core::cmp; +use usize; // Pattern @@ -345,141 +347,16 @@ impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} // Impl for &str ///////////////////////////////////////////////////////////////////////////// -// Todo: Optimize the naive implementation here - -/// Associated type for `<&str as Pattern<'a>>::Searcher`. -#[derive(Clone)] -pub struct StrSearcher<'a, 'b> { - haystack: &'a str, - needle: &'b str, - start: usize, - end: usize, - state: State, -} - -#[derive(Clone, PartialEq)] -enum State { Done, NotDone, Reject(usize, usize) } -impl State { - #[inline] fn done(&self) -> bool { *self == State::Done } - #[inline] fn take(&mut self) -> State { ::mem::replace(self, State::NotDone) } -} - /// Non-allocating substring search. /// -/// Will handle the pattern `""` as returning empty matches at each utf8 +/// Will handle the pattern `""` as returning empty matches at each character /// boundary. impl<'a, 'b> Pattern<'a> for &'b str { type Searcher = StrSearcher<'a, 'b>; #[inline] fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { - StrSearcher { - haystack: haystack, - needle: self, - start: 0, - end: haystack.len(), - state: State::NotDone, - } - } -} - -unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { - #[inline] - fn haystack(&self) -> &'a str { - self.haystack - } - - #[inline] - fn next(&mut self) -> SearchStep { - str_search_step(self, - |m: &mut StrSearcher| { - // Forward step for empty needle - let current_start = m.start; - if !m.state.done() { - m.start = m.haystack.char_range_at(current_start).next; - m.state = State::Reject(current_start, m.start); - } - SearchStep::Match(current_start, current_start) - }, - |m: &mut StrSearcher| { - // Forward step for nonempty needle - let current_start = m.start; - // Compare byte window because this might break utf8 boundaries - let possible_match = &m.haystack.as_bytes()[m.start .. m.start + m.needle.len()]; - if possible_match == m.needle.as_bytes() { - m.start += m.needle.len(); - SearchStep::Match(current_start, m.start) - } else { - // Skip a char - let haystack_suffix = &m.haystack[m.start..]; - m.start += haystack_suffix.chars().next().unwrap().len_utf8(); - SearchStep::Reject(current_start, m.start) - } - }) - } -} - -unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { - #[inline] - fn next_back(&mut self) -> SearchStep { - str_search_step(self, - |m: &mut StrSearcher| { - // Backward step for empty needle - let current_end = m.end; - if !m.state.done() { - m.end = m.haystack.char_range_at_reverse(current_end).next; - m.state = State::Reject(m.end, current_end); - } - SearchStep::Match(current_end, current_end) - }, - |m: &mut StrSearcher| { - // Backward step for nonempty needle - let current_end = m.end; - // Compare byte window because this might break utf8 boundaries - let possible_match = &m.haystack.as_bytes()[m.end - m.needle.len() .. m.end]; - if possible_match == m.needle.as_bytes() { - m.end -= m.needle.len(); - SearchStep::Match(m.end, current_end) - } else { - // Skip a char - let haystack_prefix = &m.haystack[..m.end]; - m.end -= haystack_prefix.chars().rev().next().unwrap().len_utf8(); - SearchStep::Reject(m.end, current_end) - } - }) - } -} - -// Helper function for encapsulating the common control flow -// of doing a search step from the front or doing a search step from the back -fn str_search_step(mut m: &mut StrSearcher, - empty_needle_step: F, - nonempty_needle_step: G) -> SearchStep - where F: FnOnce(&mut StrSearcher) -> SearchStep, - G: FnOnce(&mut StrSearcher) -> SearchStep -{ - if m.state.done() { - SearchStep::Done - } else if m.needle.is_empty() && m.start <= m.end { - // Case for needle == "" - if let State::Reject(a, b) = m.state.take() { - SearchStep::Reject(a, b) - } else { - if m.start == m.end { - m.state = State::Done; - } - empty_needle_step(&mut m) - } - } else if m.start + m.needle.len() <= m.end { - // Case for needle != "" - nonempty_needle_step(&mut m) - } else if m.start < m.end { - // Remaining slice shorter than needle, reject it - m.state = State::Done; - SearchStep::Reject(m.start, m.end) - } else { - m.state = State::Done; - SearchStep::Done + StrSearcher::new(haystack, self) } } @@ -633,3 +510,464 @@ impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { impl<'a, 'b> Pattern<'a> for &'b &'b str { pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); } + + +///////////////////////////////////////////////////////////////////////////// +// Two Way substring searcher +///////////////////////////////////////////////////////////////////////////// + +#[derive(Clone, Debug)] +/// Associated type for `<&str as Pattern<'a>>::Searcher`. +pub struct StrSearcher<'a, 'b> { + haystack: &'a str, + needle: &'b str, + + searcher: StrSearcherImpl, +} + +#[derive(Clone, Debug)] +enum StrSearcherImpl { + Empty(EmptyNeedle), + TwoWay { + last_match_fw: Option<(usize, usize)>, + last_match_bw: Option<(usize, usize)>, + searcher: TwoWaySearcher, + } +} + +#[derive(Clone, Debug)] +struct EmptyNeedle { + position: usize, + end: usize, + is_match_fw: bool, + is_match_bw: bool, +} + +impl<'a, 'b> StrSearcher<'a, 'b> { + fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { + if needle.is_empty() { + StrSearcher { + haystack: haystack, + needle: needle, + searcher: StrSearcherImpl::Empty(EmptyNeedle { + position: 0, + end: haystack.len(), + is_match_fw: true, + is_match_bw: true, + }), + } + } else { + StrSearcher { + haystack: haystack, + needle: needle, + searcher: StrSearcherImpl::TwoWay { + last_match_fw: None, + last_match_bw: None, + searcher: TwoWaySearcher::new(needle.as_bytes(), haystack.len()) + }, + } + } + } +} + +unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { + fn haystack(&self) -> &'a str { self.haystack } + + #[inline] + fn next(&mut self) -> SearchStep { + match self.searcher { + StrSearcherImpl::Empty(ref mut searcher) => { + // empty needle rejects every char and matches every empty string between them + let is_match = searcher.is_match_fw; + searcher.is_match_fw = !searcher.is_match_fw; + let pos = searcher.position; + match self.haystack[pos..].chars().next() { + _ if is_match => SearchStep::Match(pos, pos), + None => SearchStep::Done, + Some(ch) => { + searcher.position += ch.len_utf8(); + SearchStep::Reject(pos, searcher.position) + } + } + } + StrSearcherImpl::TwoWay { ref mut last_match_fw, ref mut searcher, .. } => { + // TwoWaySearcher produces valid *Match* indices that split at char boundaries + // as long as it does correct matching and that haystack and needle are + // valid UTF-8 + // *Rejects* fall on the same indices (the intervals between matches) + // so they are always on character boundaries. + if let Some((a, b)) = last_match_fw.take() { + return SearchStep::Match(a, b); + } + let last_pos = searcher.position; + let is_long = searcher.memory == usize::MAX; + let next_match = searcher.next(self.haystack.as_bytes(), + self.needle.as_bytes(), + is_long); + match next_match { + None => if last_pos != self.haystack.len() { + SearchStep::Reject(last_pos, self.haystack.len()) + } else { + SearchStep::Done + }, + Some((a, b)) => { + if a == last_pos { + SearchStep::Match(a, b) + } else { + *last_match_fw = Some((a, b)); + SearchStep::Reject(last_pos, a) + } + } + } + } + } + } + +} +unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { + #[inline] + fn next_back(&mut self) -> SearchStep { + match self.searcher { + StrSearcherImpl::Empty(ref mut searcher) => { + let is_match = searcher.is_match_bw; + searcher.is_match_bw = !searcher.is_match_bw; + let end = searcher.end; + match self.haystack[..end].chars().next_back() { + _ if is_match => SearchStep::Match(end, end), + None => SearchStep::Done, + Some(ch) => { + searcher.end -= ch.len_utf8(); + SearchStep::Reject(searcher.end, end) + } + } + } + StrSearcherImpl::TwoWay { ref mut last_match_bw, ref mut searcher, .. } => { + if let Some((a, b)) = last_match_bw.take() { + return SearchStep::Match(a, b); + } + let last_end = searcher.end; + let next_match = searcher.next_back(self.haystack.as_bytes(), + self.needle.as_bytes()); + match next_match { + None => if last_end != 0 { + SearchStep::Reject(0, last_end) + } else { + SearchStep::Done + }, + Some((a, b)) => { + if b == last_end { + SearchStep::Match(a, b) + } else { + *last_match_bw = Some((a, b)); + SearchStep::Reject(b, last_end) + } + } + } + } + } + } +} + +/// The internal state of an iterator that searches for matches of a substring +/// within a larger string using two-way search +#[derive(Clone, Debug)] +struct TwoWaySearcher { + // constants + crit_pos: usize, + period: usize, + byteset: u64, + + // variables + position: usize, + end: usize, + memory: usize +} + +/* + This is the Two-Way search algorithm, which was introduced in the paper: + Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. + + Here's some background information. + + A *word* is a string of symbols. The *length* of a word should be a familiar + notion, and here we denote it for any word x by |x|. + (We also allow for the possibility of the *empty word*, a word of length zero). + + If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a + *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. + For example, both 1 and 2 are periods for the string "aa". As another example, + the only period of the string "abcd" is 4. + + We denote by period(x) the *smallest* period of x (provided that x is non-empty). + This is always well-defined since every non-empty word x has at least one period, + |x|. We sometimes call this *the period* of x. + + If u, v and x are words such that x = uv, where uv is the concatenation of u and + v, then we say that (u, v) is a *factorization* of x. + + Let (u, v) be a factorization for a word x. Then if w is a non-empty word such + that both of the following hold + + - either w is a suffix of u or u is a suffix of w + - either w is a prefix of v or v is a prefix of w + + then w is said to be a *repetition* for the factorization (u, v). + + Just to unpack this, there are four possibilities here. Let w = "abc". Then we + might have: + + - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") + - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") + - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") + - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") + + Note that the word vu is a repetition for any factorization (u,v) of x = uv, + so every factorization has at least one repetition. + + If x is a string and (u, v) is a factorization for x, then a *local period* for + (u, v) is an integer r such that there is some word w such that |w| = r and w is + a repetition for (u, v). + + We denote by local_period(u, v) the smallest local period of (u, v). We sometimes + call this *the local period* of (u, v). Provided that x = uv is non-empty, this + is well-defined (because each non-empty word has at least one factorization, as + noted above). + + It can be proven that the following is an equivalent definition of a local period + for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for + all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are + defined. (i.e. i > 0 and i + r < |x|). + + Using the above reformulation, it is easy to prove that + + 1 <= local_period(u, v) <= period(uv) + + A factorization (u, v) of x such that local_period(u,v) = period(x) is called a + *critical factorization*. + + The algorithm hinges on the following theorem, which is stated without proof: + + **Critical Factorization Theorem** Any word x has at least one critical + factorization (u, v) such that |u| < period(x). + + The purpose of maximal_suffix is to find such a critical factorization. + +*/ +impl TwoWaySearcher { + fn new(needle: &[u8], end: usize) -> TwoWaySearcher { + let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); + let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); + + let (crit_pos, period) = + if crit_pos_false > crit_pos_true { + (crit_pos_false, period_false) + } else { + (crit_pos_true, period_true) + }; + + // This isn't in the original algorithm, as far as I'm aware. + let byteset = needle.iter() + .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a); + + // A particularly readable explanation of what's going on here can be found + // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically + // see the code for "Algorithm CP" on p. 323. + // + // What's going on is we have some critical factorization (u, v) of the + // needle, and we want to determine whether u is a suffix of + // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use + // "Algorithm CP2", which is optimized for when the period of the needle + // is large. + if &needle[..crit_pos] == &needle[period.. period + crit_pos] { + // short period case + TwoWaySearcher { + crit_pos: crit_pos, + period: period, + byteset: byteset, + + position: 0, + end: end, + memory: 0 + } + } else { + // long period case + // we have an approximation to the actual period, and don't use memory. + TwoWaySearcher { + crit_pos: crit_pos, + period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, + byteset: byteset, + + position: 0, + end: end, + memory: usize::MAX // Dummy value to signify that the period is long + } + } + } + + #[inline(always)] + fn byteset_contains(&self, byte: u8) -> bool { + (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 + } + + // One of the main ideas of Two-Way is that we factorize the needle into + // two halves, (u, v), and begin trying to find v in the haystack by scanning + // left to right. If v matches, we try to match u by scanning right to left. + // How far we can jump when we encounter a mismatch is all based on the fact + // that (u, v) is a critical factorization for the needle. + #[inline] + fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) + -> Option<(usize, usize)> { + // `next()` uses `self.position` as its cursor + 'search: loop { + // Check that we have room to search in + if self.position + needle.len() > haystack.len() { + self.position = haystack.len(); + return None; + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(haystack[self.position + needle.len() - 1]) { + self.position += needle.len(); + if !long_period { + self.memory = 0; + } + continue 'search; + } + + // See if the right part of the needle matches + let start = if long_period { self.crit_pos } + else { cmp::max(self.crit_pos, self.memory) }; + for i in start..needle.len() { + if needle[i] != haystack[self.position + i] { + self.position += i - self.crit_pos + 1; + if !long_period { + self.memory = 0; + } + continue 'search; + } + } + + // See if the left part of the needle matches + let start = if long_period { 0 } else { self.memory }; + for i in (start..self.crit_pos).rev() { + if needle[i] != haystack[self.position + i] { + self.position += self.period; + if !long_period { + self.memory = needle.len() - self.period; + } + continue 'search; + } + } + + // We have found a match! + let match_pos = self.position; + + // Note: add self.period instead of needle.len() to have overlapping matches + self.position += needle.len(); + if !long_period { + self.memory = 0; // set to needle.len() - self.period for overlapping matches + } + + return Some((match_pos, match_pos + needle.len())); + } + } + + // Follows the ideas in `next()`. + // + // All the definitions are completely symmetrical, with period(x) = period(reverse(x)) + // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) + // is a critical factorization, so is (reverse(v), reverse(u)). Similarly, + // the "period" stored in self.period is the real period if long_period is + // false, and so is still valid for a reversed needle, and if long_period is + // true, all the algorithm requires is that self.period is less than or + // equal to the real period, which must be true for the forward case anyway. + // + // To search in reverse through the haystack, we search forward through + // a reversed haystack with a reversed needle, and the above paragraph shows + // that the precomputed parameters can be left alone. + #[inline] + fn next_back(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(usize, usize)> { + // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` + // are independent. + 'search: loop { + // Check that we have room to search in + if needle.len() > self.end { + self.end = 0; + return None; + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(haystack[self.end - needle.len()]) { + self.end -= needle.len(); + continue 'search; + } + + // See if the left part of the needle matches + for i in (0..self.crit_pos).rev() { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.crit_pos - i; + continue 'search; + } + } + + // See if the right part of the needle matches + for i in self.crit_pos..needle.len() { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.period; + continue 'search; + } + } + + // We have found a match! + let match_pos = self.end - needle.len(); + // Note: sub self.period instead of needle.len() to have overlapping matches + self.end -= needle.len(); + + return Some((match_pos, match_pos + needle.len())); + } + } + + // Computes a critical factorization (u, v) of `arr`. + // Specifically, returns (i, p), where i is the starting index of v in some + // critical factorization (u, v) and p = period(v) + #[inline] + fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) { + let mut left: usize = !0; // Corresponds to i in the paper + let mut right = 0; // Corresponds to j in the paper + let mut offset = 1; // Corresponds to k in the paper + let mut period = 1; // Corresponds to p in the paper + + while right + offset < arr.len() { + let a; + let b; + if reversed { + a = arr[left.wrapping_add(offset)]; + b = arr[right + offset]; + } else { + a = arr[right + offset]; + b = arr[left.wrapping_add(offset)]; + } + if a < b { + // Suffix is smaller, period is entire prefix so far. + right += offset; + offset = 1; + period = right.wrapping_sub(left); + } else if a == b { + // Advance through repetition of the current period. + if offset == period { + right += offset; + offset = 1; + } else { + offset += 1; + } + } else { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 1; + period = 1; + } + } + (left.wrapping_add(1), period) + } +} From a6dd2031a363d0b46caea06ef77209f678327719 Mon Sep 17 00:00:00 2001 From: Ulrik Sverdrup Date: Wed, 17 Jun 2015 18:35:43 +0200 Subject: [PATCH 2/4] StrSearcher: Specialize is_prefix_of/is_suffix_of for &str --- src/libcore/str/pattern.rs | 50 +++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index de40a57fe9c17..048bcc0dce53f 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -343,23 +343,6 @@ unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} -///////////////////////////////////////////////////////////////////////////// -// Impl for &str -///////////////////////////////////////////////////////////////////////////// - -/// Non-allocating substring search. -/// -/// Will handle the pattern `""` as returning empty matches at each character -/// boundary. -impl<'a, 'b> Pattern<'a> for &'b str { - type Searcher = StrSearcher<'a, 'b>; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { - StrSearcher::new(haystack, self) - } -} - ///////////////////////////////////////////////////////////////////////////// macro_rules! pattern_methods { @@ -511,6 +494,39 @@ impl<'a, 'b> Pattern<'a> for &'b &'b str { pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); } +///////////////////////////////////////////////////////////////////////////// +// Impl for &str +///////////////////////////////////////////////////////////////////////////// + +/// Non-allocating substring search. +/// +/// Will handle the pattern `""` as returning empty matches at each character +/// boundary. +impl<'a, 'b> Pattern<'a> for &'b str { + type Searcher = StrSearcher<'a, 'b>; + + #[inline] + fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { + StrSearcher::new(haystack, self) + } + + /// Checks whether the pattern matches at the front of the haystack + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + // Use `as_bytes` so that we can slice through a character in the haystack. + // Since self is always valid UTF-8, this can't result in a false positive. + self.len() <= haystack.len() && + self.as_bytes() == &haystack.as_bytes()[..self.len()] + } + + /// Checks whether the pattern matches at the back of the haystack + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool { + self.len() <= haystack.len() && + self.as_bytes() == &haystack.as_bytes()[haystack.len() - self.len()..] + } +} + ///////////////////////////////////////////////////////////////////////////// // Two Way substring searcher From 71006bd6545015d7ae8e4c18589812bc1f6373dd Mon Sep 17 00:00:00 2001 From: Ulrik Sverdrup Date: Sun, 21 Jun 2015 19:44:25 +0200 Subject: [PATCH 3/4] StrSearcher: Use trait to specialize two way algorithm by case Use a trait to be able to implement both the fast search that skips to each match, and the slower search that emits `Reject` intervals regularly. The latter is important for uses of `next_reject`. --- src/libcore/str/pattern.rs | 190 ++++++++++++++++++++++++++----------- 1 file changed, 133 insertions(+), 57 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 048bcc0dce53f..6d68615a8cfd7 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -544,11 +544,7 @@ pub struct StrSearcher<'a, 'b> { #[derive(Clone, Debug)] enum StrSearcherImpl { Empty(EmptyNeedle), - TwoWay { - last_match_fw: Option<(usize, usize)>, - last_match_bw: Option<(usize, usize)>, - searcher: TwoWaySearcher, - } + TwoWay(TwoWaySearcher), } #[derive(Clone, Debug)] @@ -576,11 +572,9 @@ impl<'a, 'b> StrSearcher<'a, 'b> { StrSearcher { haystack: haystack, needle: needle, - searcher: StrSearcherImpl::TwoWay { - last_match_fw: None, - last_match_bw: None, - searcher: TwoWaySearcher::new(needle.as_bytes(), haystack.len()) - }, + searcher: StrSearcherImpl::TwoWay( + TwoWaySearcher::new(needle.as_bytes(), haystack.len()) + ), } } } @@ -606,39 +600,55 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { } } } - StrSearcherImpl::TwoWay { ref mut last_match_fw, ref mut searcher, .. } => { + StrSearcherImpl::TwoWay(ref mut searcher) => { // TwoWaySearcher produces valid *Match* indices that split at char boundaries // as long as it does correct matching and that haystack and needle are // valid UTF-8 - // *Rejects* fall on the same indices (the intervals between matches) - // so they are always on character boundaries. - if let Some((a, b)) = last_match_fw.take() { - return SearchStep::Match(a, b); + // *Rejects* from the algorithm can fall on any indices, but we will walk them + // manually to the next character boundary, so that they are utf-8 safe. + if searcher.position == self.haystack.len() { + return SearchStep::Done; } - let last_pos = searcher.position; let is_long = searcher.memory == usize::MAX; - let next_match = searcher.next(self.haystack.as_bytes(), - self.needle.as_bytes(), - is_long); - match next_match { - None => if last_pos != self.haystack.len() { - SearchStep::Reject(last_pos, self.haystack.len()) - } else { - SearchStep::Done - }, - Some((a, b)) => { - if a == last_pos { - SearchStep::Match(a, b) - } else { - *last_match_fw = Some((a, b)); - SearchStep::Reject(last_pos, a) + match searcher.next::(self.haystack.as_bytes(), + self.needle.as_bytes(), + is_long) + { + SearchStep::Reject(a, mut b) => { + // skip to next char boundary + while !self.haystack.is_char_boundary(b) { + b += 1; } + searcher.position = cmp::max(b, searcher.position); + SearchStep::Reject(a, b) } + otherwise => otherwise, } } } } + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + match self.searcher { + StrSearcherImpl::Empty(..) => { + loop { + match self.next() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + SearchStep::Reject(..) => { } + } + } + } + StrSearcherImpl::TwoWay(ref mut searcher) => { + let is_long = searcher.memory == usize::MAX; + searcher.next::(self.haystack.as_bytes(), + self.needle.as_bytes(), + is_long) + } + } + } + } unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { #[inline] @@ -657,31 +667,45 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { } } } - StrSearcherImpl::TwoWay { ref mut last_match_bw, ref mut searcher, .. } => { - if let Some((a, b)) = last_match_bw.take() { - return SearchStep::Match(a, b); + StrSearcherImpl::TwoWay(ref mut searcher) => { + if searcher.end == 0 { + return SearchStep::Done; } - let last_end = searcher.end; - let next_match = searcher.next_back(self.haystack.as_bytes(), - self.needle.as_bytes()); - match next_match { - None => if last_end != 0 { - SearchStep::Reject(0, last_end) - } else { - SearchStep::Done - }, - Some((a, b)) => { - if b == last_end { - SearchStep::Match(a, b) - } else { - *last_match_bw = Some((a, b)); - SearchStep::Reject(b, last_end) + match searcher.next_back::(self.haystack.as_bytes(), + self.needle.as_bytes()) + { + SearchStep::Reject(mut a, b) => { + // skip to next char boundary + while !self.haystack.is_char_boundary(a) { + a -= 1; } + searcher.end = cmp::min(a, searcher.end); + SearchStep::Reject(a, b) } + otherwise => otherwise, } } } } + + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + match self.searcher { + StrSearcherImpl::Empty(..) => { + loop { + match self.next_back() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + SearchStep::Reject(..) => { } + } + } + } + StrSearcherImpl::TwoWay(ref mut searcher) => { + searcher.next_back::(self.haystack.as_bytes(), + self.needle.as_bytes()) + } + } + } } /// The internal state of an iterator that searches for matches of a substring @@ -831,14 +855,21 @@ impl TwoWaySearcher { // How far we can jump when we encounter a mismatch is all based on the fact // that (u, v) is a critical factorization for the needle. #[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) - -> Option<(usize, usize)> { + fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) + -> S::Output + where S: TwoWayStrategy + { // `next()` uses `self.position` as its cursor + let old_pos = self.position; 'search: loop { // Check that we have room to search in - if self.position + needle.len() > haystack.len() { + if needle.len() > haystack.len() - self.position { self.position = haystack.len(); - return None; + return S::rejecting(old_pos, self.position); + } + + if S::use_early_reject() && old_pos != self.position { + return S::rejecting(old_pos, self.position); } // Quickly skip by large portions unrelated to our substring @@ -884,7 +915,7 @@ impl TwoWaySearcher { self.memory = 0; // set to needle.len() - self.period for overlapping matches } - return Some((match_pos, match_pos + needle.len())); + return S::matching(match_pos, match_pos + needle.len()); } } @@ -902,14 +933,22 @@ impl TwoWaySearcher { // a reversed haystack with a reversed needle, and the above paragraph shows // that the precomputed parameters can be left alone. #[inline] - fn next_back(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(usize, usize)> { + fn next_back(&mut self, haystack: &[u8], needle: &[u8]) + -> S::Output + where S: TwoWayStrategy + { // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` // are independent. + let old_end = self.end; 'search: loop { // Check that we have room to search in if needle.len() > self.end { self.end = 0; - return None; + return S::rejecting(0, old_end); + } + + if S::use_early_reject() && old_end != self.end { + return S::rejecting(self.end, old_end); } // Quickly skip by large portions unrelated to our substring @@ -939,7 +978,7 @@ impl TwoWaySearcher { // Note: sub self.period instead of needle.len() to have overlapping matches self.end -= needle.len(); - return Some((match_pos, match_pos + needle.len())); + return S::matching(match_pos, match_pos + needle.len()); } } @@ -987,3 +1026,40 @@ impl TwoWaySearcher { (left.wrapping_add(1), period) } } + +// TwoWayStrategy allows the algorithm to either skip non-matches as quickly +// as possible, or to work in a mode where it emits Rejects relatively quickly. +trait TwoWayStrategy { + type Output; + fn use_early_reject() -> bool; + fn rejecting(usize, usize) -> Self::Output; + fn matching(usize, usize) -> Self::Output; +} + +/// Skip to match intervals as quickly as possible +enum MatchOnly { } + +impl TwoWayStrategy for MatchOnly { + type Output = Option<(usize, usize)>; + + #[inline] + fn use_early_reject() -> bool { false } + #[inline] + fn rejecting(_a: usize, _b: usize) -> Self::Output { None } + #[inline] + fn matching(a: usize, b: usize) -> Self::Output { Some((a, b)) } +} + +/// Emit Rejects regularly +enum RejectAndMatch { } + +impl TwoWayStrategy for RejectAndMatch { + type Output = SearchStep; + + #[inline] + fn use_early_reject() -> bool { true } + #[inline] + fn rejecting(a: usize, b: usize) -> Self::Output { SearchStep::Reject(a, b) } + #[inline] + fn matching(a: usize, b: usize) -> Self::Output { SearchStep::Match(a, b) } +} From 274bb24efdbeed0ab1a91f3c02f86551ef16eac7 Mon Sep 17 00:00:00 2001 From: Ulrik Sverdrup Date: Wed, 24 Jun 2015 16:22:09 +0200 Subject: [PATCH 4/4] StrSearcher: Explicitly separate the long and short cases This is needed to not drop performance, after the trait-based changes. Force separate versions of the next method to be generated for the short and long period cases. --- src/libcore/str/pattern.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 6d68615a8cfd7..dccdaa9120d10 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -628,7 +628,7 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { } } - #[inline] + #[inline(always)] fn next_match(&mut self) -> Option<(usize, usize)> { match self.searcher { StrSearcherImpl::Empty(..) => { @@ -642,9 +642,15 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { } StrSearcherImpl::TwoWay(ref mut searcher) => { let is_long = searcher.memory == usize::MAX; - searcher.next::(self.haystack.as_bytes(), - self.needle.as_bytes(), - is_long) + if is_long { + searcher.next::(self.haystack.as_bytes(), + self.needle.as_bytes(), + true) + } else { + searcher.next::(self.haystack.as_bytes(), + self.needle.as_bytes(), + false) + } } } } @@ -854,7 +860,7 @@ impl TwoWaySearcher { // left to right. If v matches, we try to match u by scanning right to left. // How far we can jump when we encounter a mismatch is all based on the fact // that (u, v) is a critical factorization for the needle. - #[inline] + #[inline(always)] fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output where S: TwoWayStrategy