Skip to content

Commit

Permalink
feat(dictgen): Add aho-corasick support
Browse files Browse the repository at this point in the history
  • Loading branch information
epage committed Dec 31, 2024
1 parent 44cf2f8 commit 74d9cdb
Show file tree
Hide file tree
Showing 9 changed files with 138,279 additions and 3 deletions.
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/dictgen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ default = ["std"]
std = []
codegen = ["std", "dep:phf_codegen"]
map = ["dep:phf", "dep:phf_shared"]
aho-corasick = ["dep:aho-corasick"]

[dependencies]
unicase = "2.7"
phf = { version = "0.11", features = ["unicase"], optional = true }
phf_shared = { version = "0.11", optional = true }
phf_codegen = { version = "0.11", optional = true }
aho-corasick = { version = "1.1.3", optional = true }

[lints]
workspace = true
112 changes: 112 additions & 0 deletions crates/dictgen/src/aho_corasick.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
pub use ::aho_corasick::automaton::Automaton;
pub use ::aho_corasick::dfa::Builder;
pub use ::aho_corasick::dfa::DFA;
pub use ::aho_corasick::Anchored;
pub use ::aho_corasick::Input;
pub use ::aho_corasick::MatchKind;
pub use ::aho_corasick::StartKind;

/// Code generator that emits a dictionary type backed by an aho-corasick DFA.
///
/// Obtained from [`crate::DictGen::aho_corasick`]; the wrapped generator
/// supplies the `name` and `value_type` used in the emitted source.
#[cfg(feature = "codegen")]
pub struct AhoCorasickGen<'g> {
// Generator settings (read as `name` / `value_type` when writing output).
pub(crate) gen: crate::DictGen<'g>,
}

#[cfg(feature = "codegen")]
impl AhoCorasickGen<'_> {
    /// Writes Rust source for an aho-corasick backed dictionary to `file`.
    ///
    /// The emitted code declares `pub struct {name}` with two members:
    ///
    /// * `new()` builds an anchored, leftmost-longest, ASCII-case-insensitive
    ///   DFA over every ASCII key in `data`, and references an ordered-map
    ///   table (`UNICODE_TABLE`) holding the non-ASCII keys.
    /// * `find(word)` looks up ASCII words through the DFA and everything
    ///   else through the ordered map.
    ///
    /// `data` yields `(key, value)` pairs; each value is rendered into the
    /// generated source through its `Display` implementation. Entries are
    /// sorted case-insensitively by key before being written, so the needle
    /// list and the value table share the same order (the generated `find`
    /// indexes the value table by the matched pattern id).
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while writing to `file`.
    pub fn write<W: std::io::Write, V: std::fmt::Display>(
        &self,
        file: &mut W,
        data: impl Iterator<Item = (impl AsRef<str>, V)>,
    ) -> Result<(), std::io::Error> {
        let mut data: Vec<_> = data.collect();
        // Compare borrowed keys: same case-insensitive ordering as before,
        // without allocating an owned `String` for every comparison.
        data.sort_unstable_by(|a, b| {
            unicase::UniCase::new(a.0.as_ref()).cmp(&unicase::UniCase::new(b.0.as_ref()))
        });

        let name = self.gen.name;
        let value_type = self.gen.value_type;

        writeln!(file, "pub struct {name} {{")?;
        writeln!(file, " dfa: dictgen::aho_corasick::DFA,")?;
        writeln!(file, " unicode: &'static dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, {value_type}>,")?;
        writeln!(file, "}}")?;
        writeln!(file)?;
        writeln!(file, "impl {name} {{")?;
        writeln!(file, " pub fn new() -> Self {{")?;
        writeln!(
            file,
            " static NEEDLES: &'static [&'static [u8]] = &["
        )?;
        // Only ASCII keys become DFA needles; the rest go to the ordered map.
        for (key, _value) in data.iter().filter(|(k, _)| k.as_ref().is_ascii()) {
            let key = key.as_ref();
            writeln!(file, " b{key:?},")?;
        }
        writeln!(file, " ];")?;
        writeln!(
            file,
            " let dfa = dictgen::aho_corasick::Builder::new()"
        )?;
        writeln!(
            file,
            " .match_kind(dictgen::aho_corasick::MatchKind::LeftmostLongest)"
        )?;
        writeln!(
            file,
            " .start_kind(dictgen::aho_corasick::StartKind::Anchored)"
        )?;
        writeln!(file, " .ascii_case_insensitive(true)")?;
        writeln!(file, " .build(NEEDLES)")?;
        writeln!(file, " .unwrap();")?;
        // Non-ASCII keys are emitted as a separate ordered-map table.
        crate::DictGen::new()
            .name("UNICODE_TABLE")
            .value_type(value_type)
            .ordered_map()
            .write(
                file,
                data.iter()
                    .filter(|(k, _)| !k.as_ref().is_ascii())
                    .map(|(k, v)| (k.as_ref(), v)),
            )?;
        writeln!(file)?;
        writeln!(file, " Self {{")?;
        writeln!(file, " dfa,")?;
        writeln!(file, " unicode: &UNICODE_TABLE,")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file)?;
        writeln!(
            file,
            " pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static {value_type}> {{"
        )?;
        writeln!(
            file,
            " static PATTERNID_MAP: &'static [{value_type}] = &["
        )?;
        // Written in the same filtered order as NEEDLES so that a pattern id
        // indexes the value belonging to the matched needle.
        for (_key, value) in data.iter().filter(|(k, _)| k.as_ref().is_ascii()) {
            writeln!(file, " {value},")?;
        }
        writeln!(file, " ];")?;
        writeln!(file, " if word.is_ascii() {{")?;
        writeln!(
            file,
            " use dictgen::aho_corasick::Automaton as _;"
        )?;
        writeln!(file, " let input = dictgen::aho_corasick::Input::new(word.into_inner().as_bytes()).anchored(dictgen::aho_corasick::Anchored::Yes);")?;
        writeln!(
            file,
            " let mat = self.dfa.try_find(&input).unwrap()?;"
        )?;
        // The anchored search only guarantees that a needle is a *prefix* of
        // the word. A dictionary hit requires the match to cover the whole
        // word, so anything that ends early must be rejected (the previous
        // `==` check rejected exact hits and accepted prefix-only matches).
        writeln!(
            file,
            " if mat.end() != word.into_inner().len() {{"
        )?;
        writeln!(file, " return None;")?;
        writeln!(file, " }}")?;
        writeln!(file, " Some(&PATTERNID_MAP[mat.pattern()])")?;
        writeln!(file, " }} else {{")?;
        writeln!(file, " self.unicode.find(word)")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file, "}}")?;

        Ok(())
    }
}
5 changes: 5 additions & 0 deletions crates/dictgen/src/gen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ impl<'g> DictGen<'g> {
/// Consumes this generator and wraps it in a [`crate::MatchGen`].
pub fn r#match(self) -> crate::MatchGen<'g> {
crate::MatchGen { gen: self }
}

/// Consumes this generator and wraps it in a [`crate::AhoCorasickGen`].
///
/// Only available when the `aho-corasick` feature is enabled.
#[cfg(feature = "aho-corasick")]
pub fn aho_corasick(self) -> crate::AhoCorasickGen<'g> {
crate::AhoCorasickGen { gen: self }
}
}

impl Default for DictGen<'static> {
Expand Down
4 changes: 4 additions & 0 deletions crates/dictgen/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#![warn(clippy::print_stderr)]
#![warn(clippy::print_stdout)]

#[cfg(feature = "aho-corasick")]
pub mod aho_corasick;
#[cfg(feature = "codegen")]
mod gen;
mod insensitive;
Expand All @@ -12,6 +14,8 @@ mod r#match;
mod ordered_map;
mod trie;

#[cfg(feature = "aho-corasick")]
pub use aho_corasick::AhoCorasickGen;
#[cfg(feature = "codegen")]
pub use gen::*;
pub use insensitive::*;
Expand Down
2 changes: 1 addition & 1 deletion crates/typos-dict/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ itertools = "0.13"
edit-distance = "2.1"
unicase = "2.7"
codegenrs = "3.0"
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map"] }
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map", "aho-corasick"] }
varcon = { version = "^1.0", path = "../varcon" }
snapbox = "0.6.5"
indexmap = "2.2.6"
Expand Down
Loading

0 comments on commit 74d9cdb

Please sign in to comment.