src/string_proc.rs

//! String processing algorithms.

/// Knuth-Morris-Pratt matcher: a borrowed pattern bundled with its
/// precomputed failure table.
pub struct Matcher<'a, T> {
    /// The pattern being searched for.
    pub pattern: &'a [T],
    /// KMP failure table. `fail[i]` is the length of the longest proper
    /// prefix of `pattern[0..=i]` that is also a suffix of it.
    pub fail: Vec<usize>,
}

impl<'a, T: Eq> Matcher<'a, T> {
    /// Builds the failure table for `pattern`, enabling linear-time matching.
    ///
    /// # Example
    ///
    /// ```
    /// use contest_algorithms::string_proc::Matcher;
    /// let utf8_string = "hello";
    ///
    /// let match_from_byte_literal = Matcher::new(b"hello");
    ///
    /// let match_from_bytes = Matcher::new(utf8_string.as_bytes());
    ///
    /// let vec_char: Vec<char> = utf8_string.chars().collect();
    /// let match_from_chars = Matcher::new(&vec_char);
    ///
    /// let vec_int = vec![4, -3, 1];
    /// let match_from_ints = Matcher::new(&vec_int);
    /// ```
    ///
    /// # Panics
    ///
    /// Panics if pattern is empty.
    pub fn new(pattern: &'a [T]) -> Self {
        let mut fail = Vec::with_capacity(pattern.len());
        // A one-element prefix has no non-empty proper prefix-suffix.
        fail.push(0);
        let mut border = 0;
        // Slicing from index 1 is what panics on an empty pattern.
        for sym in pattern[1..].iter() {
            border = Self::advance(pattern, &fail, border, sym);
            fail.push(border);
        }
        Matcher { pattern, fail }
    }

    /// Runs KMP over `text`. In the returned vector, entry `i` is the length
    /// of the longest prefix of the pattern that matches a suffix of
    /// `text[0..=i]`; an entry equal to `pattern.len()` marks a full match
    /// ending at position `i`.
    pub fn kmp_match(&self, text: &[T]) -> Vec<usize> {
        let mut matched = 0;
        text.iter()
            .map(|sym| {
                // After a full match there is no next pattern element to
                // compare against, so fall back to the longest border first.
                if matched == self.pattern.len() {
                    matched = self.fail[matched - 1];
                }
                matched = Self::advance(self.pattern, &self.fail, matched, sym);
                matched
            })
            .collect()
    }

    /// Single automaton step: given that `len` pattern elements are currently
    /// matched (`len < pattern.len()`), returns the matched length after
    /// consuming `next`, following failure links while the element mismatches.
    fn advance(pattern: &[T], fail: &[usize], mut len: usize, next: &T) -> usize {
        while len > 0 && pattern[len] != *next {
            len = fail[len - 1];
        }
        if pattern[len] == *next {
            len + 1
        } else {
            len
        }
    }
}

/// Suffix array data structure, useful for a variety of string queries.
pub struct SuffixArray {
    /// The suffix array itself, holding suffix indices in sorted order.
    pub sfx: Vec<usize>,
    /// rank[i][j] = rank of the j'th suffix, considering only 2^i chars.
    /// In other words, rank[i] is a ranking of the substrings text[j..j+2^i].
    /// rank[0] holds the raw byte values of the text.
    pub rank: Vec<Vec<usize>>,
}

impl SuffixArray {
    /// O(n + max_key) stable sort on the items generated by vals.
    /// Items v in vals are sorted according to val_to_key[v].
    ///
    /// Every key must be strictly less than `max_key`, which is the number of
    /// counting buckets allocated; a larger key panics on indexing.
    fn counting_sort(
        vals: impl Iterator<Item = usize> + Clone,
        val_to_key: &[usize],
        max_key: usize,
    ) -> Vec<usize> {
        // First pass: histogram of keys.
        let mut counts = vec![0; max_key];
        for v in vals.clone() {
            counts[val_to_key[v]] += 1;
        }
        // Turn the histogram into exclusive prefix sums, so counts[k] becomes
        // the output index of the first item with key k.
        let mut total = 0;
        for c in counts.iter_mut() {
            total += *c;
            *c = total - *c;
        }
        // Second pass: place items in iteration order, which keeps the sort
        // stable.
        let mut result = vec![0; total];
        for v in vals {
            let c = &mut counts[val_to_key[v]];
            result[*c] = v;
            *c += 1;
        }
        result
    }

    /// Suffix array construction in O(n log n) time.
    ///
    /// Works by prefix doubling: suffixes are first sorted by their first
    /// byte, then repeatedly re-sorted by twice as many characters until the
    /// compared length reaches the text length. An empty text yields an empty
    /// suffix array.
    pub fn new(text: &[u8]) -> Self {
        let n = text.len();
        let init_rank = text.iter().map(|&ch| usize::from(ch)).collect::<Vec<_>>();
        let mut sfx = Self::counting_sort(0..n, &init_rank, 256);
        let mut rank = vec![init_rank];
        // Invariant at the start of every loop iteration:
        // suffixes are sorted according to the first skip characters.
        for skip in (0..).map(|i| 1 << i).take_while(|&skip| skip < n) {
            let prev_rank = rank.last().unwrap();
            // Every slot is overwritten below because sfx is a permutation of
            // 0..n, so the initial contents do not matter.
            let mut cur_rank = vec![0; n];

            // List the suffixes ordered by their second half of length skip:
            // the last skip suffixes have an empty second half and go first,
            // followed by p - skip for each already-sorted suffix p.
            let pos = (n - skip..n).chain(sfx.into_iter().filter_map(|p| p.checked_sub(skip)));
            // A stable sort by the first half then orders by 2 * skip chars.
            // Keys are byte values in the first round and ranks below n later.
            sfx = Self::counting_sort(pos, prev_rank, n.max(256));

            let mut prev = sfx[0];
            cur_rank[prev] = 0;
            for &cur in sfx.iter().skip(1) {
                // Adjacent suffixes share a rank only when both halves exist
                // for both of them and compare equal.
                if prev.max(cur) + skip < n
                    && prev_rank[prev] == prev_rank[cur]
                    && prev_rank[prev + skip] == prev_rank[cur + skip]
                {
                    cur_rank[cur] = cur_rank[prev];
                } else {
                    cur_rank[cur] = cur_rank[prev] + 1;
                }
                prev = cur;
            }
            rank.push(cur_rank);
        }
        Self { sfx, rank }
    }

    /// Computes the length of longest common prefix of text[i..] and text[j..].
    ///
    /// When `i == j` the result is the full suffix length, `text.len() - i`.
    ///
    /// # Panics
    ///
    /// Panics if `i` or `j` is not a valid index into the text.
    pub fn longest_common_prefix(&self, mut i: usize, mut j: usize) -> usize {
        // No common prefix can be longer than the shorter of the two suffixes.
        // For distinct suffixes the loop below never exceeds this bound, but a
        // suffix compared with itself matches at the top level and would
        // otherwise report a power of two larger than its own length.
        let max_len = self.sfx.len().saturating_sub(i.max(j));
        let mut len = 0;
        // Greedily take the largest power-of-two blocks that still match.
        for (k, rank) in self.rank.iter().enumerate().rev() {
            if rank[i] == rank[j] {
                i += 1 << k;
                j += 1 << k;
                len += 1 << k;
                if i.max(j) >= self.sfx.len() {
                    break;
                }
            }
        }
        len.min(max_len)
    }
}

/// Prefix trie that counts how many inserted words pass through each node.
///
/// `Default` is implemented by hand rather than derived so that the key type
/// `K` does not itself have to implement `Default`.
pub struct Trie<K: std::hash::Hash + Eq> {
    /// Number of inserted words whose path goes through this node, i.e. the
    /// number of words starting with the prefix this node represents.
    count: usize,
    /// Child nodes, keyed by the next symbol of the word.
    branches: std::collections::HashMap<K, Trie<K>>,
}

impl<K: std::hash::Hash + Eq> Default for Trie<K> {
    /// Creates an empty trie containing no words.
    fn default() -> Self {
        Self {
            count: 0,
            branches: std::collections::HashMap::new(),
        }
    }
}

impl<K: std::hash::Hash + Eq> Trie<K> {
    /// Inserts a word into the trie.
    ///
    /// Inserting the same word more than once counts it each time.
    pub fn insert(&mut self, word: impl IntoIterator<Item = K>) {
        let mut node = self;
        // The root represents the empty prefix, which every word starts with.
        node.count += 1;

        for ch in word {
            // `{ node }` moves the mutable reference instead of reborrowing
            // it, so the child reference can be stored back into `node`.
            node = { node }.branches.entry(ch).or_default();
            node.count += 1;
        }
    }

    /// Computes the number of inserted words that start with the given prefix.
    ///
    /// Returns 0 when no inserted word has that prefix. The empty prefix
    /// yields the total number of inserted words.
    pub fn get(&self, prefix: impl IntoIterator<Item = K>) -> usize {
        let mut node = self;

        for ch in prefix {
            match node.branches.get(&ch) {
                Some(sub) => node = sub,
                None => return 0,
            }
        }
        node.count
    }
}

/// Manacher's algorithm for computing palindrome substrings in linear time.
/// pal[2*i] = odd length of palindrome centred at text[i].
/// pal[2*i+1] = even length of palindrome centred at text[i+0.5].
///
/// The result has `2 * text.len() - 1` entries, one per possible centre.
/// An empty `text` has no centres and yields an empty vector.
pub fn palindromes<T: Eq>(text: &[T]) -> Vec<usize> {
    if text.is_empty() {
        return Vec::new();
    }
    // Number of centres. This is kept in a local rather than read back from
    // `pal.capacity()`, because `Vec::with_capacity` only guarantees at least
    // the requested capacity, not exactly that much.
    let num_centres = 2 * text.len() - 1;
    let mut pal = Vec::with_capacity(num_centres);
    pal.push(1);
    while pal.len() < num_centres {
        let i = pal.len() - 1;
        // Longest palindrome at centre i that still fits inside the text.
        let max_len = (i + 1).min(num_centres - i);
        // Grow the palindrome at centre i by one element on each side.
        while pal[i] < max_len && text[(i - pal[i] - 1) / 2] == text[(i + pal[i] + 1) / 2] {
            pal[i] += 2;
        }
        if pal[i] < 2 {
            // Nothing to mirror: seed the next centre with its minimal value,
            // which is 0 after an odd centre and 1 after an even one.
            let a = 1 - pal[i];
            pal.push(a);
        } else {
            // Copy answers from the mirrored centres i - d while they lie
            // strictly inside the palindrome at i. The first centre whose
            // mirror reaches the boundary is only a lower bound, so it is
            // pushed and then extended by the next outer iteration.
            for d in 1.. {
                let (a, b) = (pal[i - d], pal[i] - d);
                if a < b {
                    pal.push(a);
                } else {
                    pal.push(b);
                    break;
                }
            }
        }
    }
    pal
}

#[cfg(test)]
mod test {
    use super::*;

    /// Matching "ana" against "banana" reports the matched prefix length after
    /// each text byte; the value 3 marks the two overlapping occurrences.
    #[test]
    fn test_kmp() {
        let matcher = Matcher::new(b"ana");

        let prefix_lens = matcher.kmp_match(b"banana");

        assert_eq!(prefix_lens, vec![0, 1, 2, 3, 2, 3]);
    }

    /// Checks the sorted suffix order, two LCP queries, and that the final
    /// rank table inverts the suffix array.
    #[test]
    fn test_suffix_array() {
        let bobocel = SuffixArray::new(b"bobocel");
        let banana = SuffixArray::new(b"banana");

        assert_eq!(bobocel.sfx, vec![0, 2, 4, 5, 6, 1, 3]);
        assert_eq!(banana.sfx, vec![5, 3, 1, 0, 4, 2]);

        assert_eq!(bobocel.longest_common_prefix(0, 2), 2);
        assert_eq!(banana.longest_common_prefix(1, 3), 3);

        // Check that sfx and rank.last() are essentially inverses of each other.
        for sa in [&bobocel, &banana].iter() {
            let final_rank = sa.rank.last().unwrap();
            for (pos, &r) in final_rank.iter().enumerate() {
                assert_eq!(sa.sfx[r], pos);
            }
        }
    }

    /// Inserts a small dictionary and checks the prefix counts, including the
    /// empty prefix and a prefix that matches no word.
    #[test]
    fn test_trie() {
        let mut trie = Trie::default();
        for word in ["banana", "benefit", "banapple", "ban"].iter() {
            trie.insert(word.bytes());
        }

        let expected_counts = [
            ("", 4),
            ("b", 4),
            ("ba", 3),
            ("ban", 3),
            ("bana", 2),
            ("banan", 1),
            ("bane", 0),
        ];
        for &(prefix, count) in expected_counts.iter() {
            assert_eq!(trie.get(prefix.bytes()), count);
        }
    }

    /// Palindrome lengths for every centre of "banana"; the longest, "anana",
    /// is centred on the middle 'a'.
    #[test]
    fn test_palindrome() {
        let pal_len = palindromes(b"banana");

        assert_eq!(pal_len, vec![1, 0, 1, 0, 3, 0, 5, 0, 3, 0, 1]);
    }
}