diff --git a/divvunspell/src/archive/boxf.rs b/divvunspell/src/archive/boxf.rs index 5bddb4e..f6b6c05 100644 --- a/divvunspell/src/archive/boxf.rs +++ b/divvunspell/src/archive/boxf.rs @@ -1,3 +1,4 @@ +//! Box-based archive stuff. use std::sync::Arc; use box_format::BoxFileReader; @@ -18,28 +19,35 @@ use crate::transducer::{ use crate::vfs::boxf::Filesystem as BoxFilesystem; use crate::vfs::Filesystem; +/// An archive with mmapped language and error model THFST automata archive. pub type ThfstBoxSpellerArchive = BoxSpellerArchive< MemmapThfstTransducer, MemmapThfstTransducer, >; +/// A speller with mmapped chunked language and error model THFST automata +/// file. pub type ThfstChunkedBoxSpeller = HfstSpeller< crate::vfs::boxf::File, MemmapThfstChunkedTransducer, MemmapThfstChunkedTransducer, >; +/// A speller with mmapped language and error model THFST automata file. pub type ThfstBoxSpeller = HfstSpeller< crate::vfs::boxf::File, MemmapThfstTransducer, MemmapThfstTransducer, >; +/// An archive with mmapped chunked language and error model THFST automata +/// archive. pub type ThfstChunkedBoxSpellerArchive = BoxSpellerArchive< MemmapThfstChunkedTransducer, MemmapThfstChunkedTransducer, >; +/// Speller in box archive. pub struct BoxSpellerArchive where T: Transducer, @@ -54,6 +62,7 @@ where T: Transducer + Send + Sync + 'static, U: Transducer + Send + Sync + 'static, { + /// get the spell-checking component pub fn hfst_speller(&self) -> Arc> { self.speller.clone() } diff --git a/divvunspell/src/archive/error.rs b/divvunspell/src/archive/error.rs index 89a90de..d037ebb 100644 --- a/divvunspell/src/archive/error.rs +++ b/divvunspell/src/archive/error.rs @@ -1,3 +1,4 @@ +//! Archive-related errors. use std::{ffi::OsString, io::Error}; #[cfg(feature = "gpt2")] diff --git a/divvunspell/src/archive/meta.rs b/divvunspell/src/archive/meta.rs index 9d6b47c..e6588d9 100644 --- a/divvunspell/src/archive/meta.rs +++ b/divvunspell/src/archive/meta.rs @@ -1,3 +1,4 @@ +//! 
Archive metadata handling use serde::{Deserialize, Serialize}; use serde_xml_rs::{from_reader, Error, ParserConfig}; diff --git a/divvunspell/src/archive/mod.rs b/divvunspell/src/archive/mod.rs index cea3da4..99f1f5f 100644 --- a/divvunspell/src/archive/mod.rs +++ b/divvunspell/src/archive/mod.rs @@ -1,3 +1,4 @@ +//! Handling of archives of spell-checking models. use memmap2::Mmap; use std::{ffi::OsString, path::Path, sync::Arc}; @@ -38,24 +39,33 @@ impl MmapRef { } } +/// Speller archive is a file read into a spell-checker with metadata. pub trait SpellerArchive { + /// Read and parse a speller archive. fn open(path: &Path) -> Result where Self: Sized; + /// retrieve spell-checker. fn speller(&self) -> Arc; + /// retrieve metadata. fn metadata(&self) -> Option<&SpellerMetadata>; } +/// Predictor archive is a file read into a predictor with metadata. pub trait PredictorArchive { + /// Read and parse a predictor archive. fn open(path: &Path, predictor_name: Option<&str>) -> Result where Self: Sized; + /// Retrieve predictor. fn predictor(&self) -> Arc; + /// retrieve metadata. fn metadata(&self) -> Option<&PredictorMetadata>; } +/// Reads a speller archive. pub fn open

(path: P) -> Result, SpellerArchiveError> where P: AsRef, diff --git a/divvunspell/src/archive/zip.rs b/divvunspell/src/archive/zip.rs index 939cbd5..5c763cd 100644 --- a/divvunspell/src/archive/zip.rs +++ b/divvunspell/src/archive/zip.rs @@ -1,3 +1,4 @@ +//! Zip archive stuff. use ::zip::{CompressionMethod, ZipArchive}; use memmap2::MmapOptions; use std::fs::File; diff --git a/divvunspell/src/lib.rs b/divvunspell/src/lib.rs index d50d65e..7fddde2 100644 --- a/divvunspell/src/lib.rs +++ b/divvunspell/src/lib.rs @@ -1,3 +1,32 @@ +/*! Spell-checking and correction with Finite-State Automata. + +Implements spell-checking and correction using weighted finite-state +automata. The automata can be compiled using [`HFST`]; +this library is originally based on C++ code in [`HFST +ospell`] + +[`HFST`]: https://hfst.github.io +[`HFST ospell`]: https://github.com/hfst/hfst-ospell + +# Usage examples + +```ignore +use divvunspell::archive::ZipSpellerArchive; + +let path = Path(); +let speller = ZipSpellerArchive::open(path); +let cfg = SpellerConfig::default(); +let words = vec!("words", "schmords"); +todo!(); +``` + +Further examples of how to use divvunspell library can be found in the +[`divvunspell-bin`] in the same repository. + +[`divvunspell-bin`]: https://github.com/divvun/divvunspell + +*/ + #![warn(missing_docs)] pub mod archive; #[cfg(feature = "internal_ffi")] diff --git a/divvunspell/src/paths.rs b/divvunspell/src/paths.rs index b9d5576..ed84a3b 100644 --- a/divvunspell/src/paths.rs +++ b/divvunspell/src/paths.rs @@ -1,3 +1,4 @@ +//! Handling of system paths containing spell-checkers on different OS. #[cfg(target_os = "macos")] use std::path::PathBuf; #[cfg(target_os = "windows")] diff --git a/divvunspell/src/predictor/mod.rs b/divvunspell/src/predictor/mod.rs index 5461c3b..672ccfa 100644 --- a/divvunspell/src/predictor/mod.rs +++ b/divvunspell/src/predictor/mod.rs @@ -1,3 +1,4 @@ +//! Autocorrect type spell-checking that predicts next word. 
#[cfg(feature = "gpt2")] pub mod gpt2; diff --git a/divvunspell/src/speller/mod.rs b/divvunspell/src/speller/mod.rs index 0a1c360..dc8f17d 100644 --- a/divvunspell/src/speller/mod.rs +++ b/divvunspell/src/speller/mod.rs @@ -1,3 +1,4 @@ +//! Speller model for spell-checking and corrections. use std::f32; use std::sync::Arc; diff --git a/divvunspell/src/speller/suggestion.rs b/divvunspell/src/speller/suggestion.rs index 6ac9500..a331f68 100644 --- a/divvunspell/src/speller/suggestion.rs +++ b/divvunspell/src/speller/suggestion.rs @@ -1,3 +1,4 @@ +//! Suggestion for a spelling correction. use crate::types::Weight; use serde::{Deserialize, Serialize}; use smol_str::SmolStr; diff --git a/divvunspell/src/tokenizer/mod.rs b/divvunspell/src/tokenizer/mod.rs index d714e88..6b59e9c 100644 --- a/divvunspell/src/tokenizer/mod.rs +++ b/divvunspell/src/tokenizer/mod.rs @@ -1,3 +1,4 @@ +//! Tokenizer splits strings into words and punctuations. use unic_ucd_common::alphanumeric::is_alphanumeric; use word::{WordBoundIndices, Words}; diff --git a/divvunspell/src/transducer/hfst/mod.rs b/divvunspell/src/transducer/hfst/mod.rs index c7ec9b3..2dba15a 100644 --- a/divvunspell/src/transducer/hfst/mod.rs +++ b/divvunspell/src/transducer/hfst/mod.rs @@ -1,3 +1,4 @@ +//! Finite-state automaton in HFST format. pub mod alphabet; pub mod header; pub mod index_table; diff --git a/divvunspell/src/transducer/mod.rs b/divvunspell/src/transducer/mod.rs index 9be87a7..b8fc20d 100644 --- a/divvunspell/src/transducer/mod.rs +++ b/divvunspell/src/transducer/mod.rs @@ -1,3 +1,10 @@ +//! Transducer is a Finite-State Automaton with two tapes / two symbols per +//! transition. +//! +//! Transducer in divvunspell is modeled after the C++ transducer in the +//! hfst-ospell library. It may contain some complex optimisations and +//! specifics to underlying finite-state systems and lot of this is +//! pretty hacky. 
pub mod hfst; pub mod thfst; @@ -11,17 +18,22 @@ use self::symbol_transition::SymbolTransition; use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; use crate::vfs::{self, Filesystem}; +/// Error with transducer reading or processing. #[derive(Debug, thiserror::Error)] pub enum TransducerError { + /// Error with mmapping #[error("Memory mapping error")] Memmap(#[source] std::io::Error), + /// Error with input/output. #[error("IO error")] Io(#[source] std::io::Error), + /// Error with FSA alphabets. #[error("Alphabet error")] Alphabet(#[source] Box), } impl TransducerError { + /// Wrap into i/o error. pub fn into_io_error(self) -> std::io::Error { match self { TransducerError::Memmap(v) => v, @@ -33,53 +45,76 @@ impl TransducerError { } } +/// A file-based finite-state transducer. pub trait Transducer: Sized { + /// file extension. const FILE_EXT: &'static str; + /// read a transducer from a file. fn from_path(fs: &FS, path: P) -> Result where P: AsRef, FS: Filesystem; + /// get transducer's alphabet. fn alphabet(&self) -> &TransducerAlphabet; + /// get transducer's alphabet as mutable reference. fn mut_alphabet(&mut self) -> &mut TransducerAlphabet; + /// get input symbol number of given transition arc. fn transition_input_symbol(&self, i: TransitionTableIndex) -> Option; + /// check if there are transitions at given index. fn has_transitions(&self, i: TransitionTableIndex, s: Option) -> bool; + /// get next transition with a symbol. fn next(&self, i: TransitionTableIndex, symbol: SymbolNumber) -> Option; + /// check if there are free transitions at index. fn has_epsilons_or_flags(&self, i: TransitionTableIndex) -> bool; + /// follow free transitions. fn take_epsilons_and_flags(&self, i: TransitionTableIndex) -> Option; + /// follow epsilon transitions. fn take_epsilons(&self, i: TransitionTableIndex) -> Option; + /// follow transitions with given symbol. 
fn take_non_epsilons( &self, i: TransitionTableIndex, symbol: SymbolNumber, ) -> Option; + /// check if given index is an end state. fn is_final(&self, i: TransitionTableIndex) -> bool; + /// get end state weight of a state. fn final_weight(&self, i: TransitionTableIndex) -> Option; } +/// Transition table contains the arcs of the automaton (and states). pub trait TransitionTable: Sized { + /// read transition table from a file. fn from_path(fs: &FS, path: P) -> Result where P: AsRef, FS: Filesystem; + /// get input symbol of a transition. fn input_symbol(&self, i: TransitionTableIndex) -> Option; + /// get output symbol of a transition. fn output_symbol(&self, i: TransitionTableIndex) -> Option; + /// get the target state in the index. fn target(&self, i: TransitionTableIndex) -> Option; + /// get the weight of the transition. fn weight(&self, i: TransitionTableIndex) -> Option; + /// check if the state is a final state. #[inline(always)] fn is_final(&self, i: TransitionTableIndex) -> bool { self.input_symbol(i) == None && self.output_symbol(i) == None && self.target(i) == Some(1) } + /// build a `SymbolTransition` from the target, output symbol and weight at index `i`. #[inline(always)] fn symbol_transition(&self, i: TransitionTableIndex) -> SymbolTransition { SymbolTransition::new(self.target(i), self.output_symbol(i), self.weight(i)) } } +/// Index table contains something. pub trait IndexTable: Sized { fn from_path(fs: &FS, path: P) -> Result where diff --git a/divvunspell/src/transducer/thfst/mod.rs b/divvunspell/src/transducer/thfst/mod.rs index 7778994..44dd0c4 100644 --- a/divvunspell/src/transducer/thfst/mod.rs +++ b/divvunspell/src/transducer/thfst/mod.rs @@ -1,3 +1,4 @@ +//! Finite-state automaton in optimised mmapped format. // We manually ensure alignment of reads in this file. #![allow(clippy::cast_ptr_alignment)] diff --git a/divvunspell/src/vfs.rs b/divvunspell/src/vfs.rs index fc7397d..c3c2c38 100644 --- a/divvunspell/src/vfs.rs +++ b/divvunspell/src/vfs.rs @@ -1,3 +1,4 @@ +//! Some stuff for filesystems and different OSes. 
use fs_extra::dir::CopyOptions; use memmap2::{Mmap, MmapOptions}; use std::fmt::Debug; @@ -56,6 +57,7 @@ impl File for std::fs::File { } } +/// File system. pub struct Fs; impl Filesystem for Fs { @@ -74,6 +76,7 @@ impl Filesystem for Fs { } } +/// Box file. pub mod boxf { use box_format::{BoxFileReader, BoxPath}; use std::io::{Read, Result};