Skip to content

Commit

Permalink
api: make FSTs generic over AsRef<[u8]>
Browse files Browse the repository at this point in the history
This change was not nearly as bad as I thought it would be. Instead of
trying to provide all the different possible ways to store some bytes,
we instead make FSTs maximally flexible by accepting any type that can
cheaply provide a byte slice. This should resolve a number of issues with
folks constructing FSTs in ways that weren't supported by the old
constructors.

As a bonus, we no longer need to directly depend on a specific
implementation of memory maps. Conveniently, the `Mmap` type in the
`memmap` crate already implements `AsRef<[u8]>`, so using memory maps is
as simple as

    let mmap = unsafe { memmap::Mmap::map(&File::open("something.fst").unwrap()).unwrap() };
    let fst = Fst::new(mmap).unwrap();

Fixes #92, Fixes #94, Fixes #97
  • Loading branch information
BurntSushi committed Mar 6, 2020
1 parent 3ace3b6 commit 423a0c7
Show file tree
Hide file tree
Showing 26 changed files with 346 additions and 505 deletions.
5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,17 @@ edition = "2018"
members = ["bench", "fst-bin"]

[features]
default = ["mmap"]
mmap = ["memmap"]
default = []
levenshtein = ["utf8-ranges"]

[dependencies]
byteorder = "1"
memmap = { version = "0.6.0", optional = true }
utf8-ranges = { version = "1", optional = true }

[dev-dependencies]
fnv = "1.0.5"
lazy_static = "0.2.8"
memmap = "0.7"
quickcheck = { version = "0.7", default-features = false }
rand = "0.5"

Expand Down
1 change: 1 addition & 0 deletions fst-bin/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ csv = "1.1.3"
docopt = "1.1"
fst = { path = "..", version = "0.3", features = ["levenshtein"] }
lines = "0.0"
memmap = "0.7"
num_cpus = "1.5"
regex-automata = { version = "*", path = "/home/andrew/rust/regex-automata", features = ["fst1"] }
serde = { version = "1.0.104", features = ["derive"] }
Expand Down
2 changes: 1 addition & 1 deletion fst-bin/src/cmd/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {
let wtr = util::get_writer(args.arg_output.as_ref())?;
let mut wtr = csv::Writer::from_writer(wtr);

let fst = unsafe { fst::raw::Fst::from_path(args.arg_input) }?;
let fst = unsafe { util::mmap_fst(args.arg_input)? };
let mut set = BitSet::with_capacity(fst.len());

if args.cmd_edges {
Expand Down
2 changes: 1 addition & 1 deletion fst-bin/src/cmd/dot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {
.unwrap_or_else(|e| e.exit());

let mut wtr = util::get_buf_writer(args.arg_output.as_ref())?;
let fst = unsafe { fst::raw::Fst::from_path(&args.arg_input) }?;
let fst = unsafe { util::mmap_fst(&args.arg_input)? };
let mut set = BitSet::with_capacity(fst.len());

let mut stack = vec![fst.root().addr()];
Expand Down
2 changes: 1 addition & 1 deletion fst-bin/src/cmd/dupes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {
.unwrap_or_else(|e| e.exit());

let mut wtr = util::get_buf_writer(args.arg_output.as_ref())?;
let fst = unsafe { fst::raw::Fst::from_path(args.arg_input) }?;
let fst = unsafe { util::mmap_fst(args.arg_input)? };
let mut set = BitSet::with_capacity(fst.len());
let mut node_counts = HashMap::with_capacity(10_000);

Expand Down
3 changes: 1 addition & 2 deletions fst-bin/src/cmd/fuzzy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ use std::io;

use docopt::Docopt;
use fst::automaton::Levenshtein;
use fst::raw::Fst;
use serde::Deserialize;

use crate::util;
Expand Down Expand Up @@ -48,7 +47,7 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {
let args: Args = Docopt::new(USAGE)
.and_then(|d| d.argv(&argv).deserialize())
.unwrap_or_else(|e| e.exit());
let fst = unsafe { Fst::from_path(&args.arg_fst) }?;
let fst = unsafe { util::mmap_fst(&args.arg_fst)? };
let lev = Levenshtein::new(&args.arg_query, args.flag_distance)?;
let mut q = fst.search(&lev);
if let Some(ref start) = args.flag_start {
Expand Down
8 changes: 1 addition & 7 deletions fst-bin/src/cmd/grep.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
use std::io;

use docopt::Docopt;
use fst::raw::Fst;
use regex_automata::dense;
// use regex_automata::sparse::SparseDFA;
use serde::Deserialize;

use crate::util;
Expand Down Expand Up @@ -40,7 +38,7 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {
let args: Args = Docopt::new(USAGE)
.and_then(|d| d.argv(&argv).deserialize())
.unwrap_or_else(|e| e.exit());
let fst = unsafe { Fst::from_path(&args.arg_fst) }?;
let fst = unsafe { util::mmap_fst(&args.arg_fst)? };
let dense_dfa = dense::Builder::new()
.anchored(true)
.byte_classes(false)
Expand All @@ -50,10 +48,6 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {
dense::DenseDFA::Standard(dfa) => dfa,
_ => unreachable!(),
};
// let dfa = match dense_dfa.to_sparse()? {
// SparseDFA::Standard(dfa) => dfa,
// _ => unreachable!(),
// };
let mut q = fst.search(&dfa);
if let Some(ref start) = args.flag_start {
q = q.ge(start);
Expand Down
16 changes: 0 additions & 16 deletions fst-bin/src/cmd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,3 @@ pub mod range;
pub mod rust;
pub mod set;
pub mod union;

// If compile times become unruly, comment out unused modules above and use
// the following macro to satisfy the compiler.
// macro_rules! unused {
// ($($name:ident),*) => {
// $(
// pub mod $name {
// pub fn run(_: Vec<String>) -> Result<(), ::Error> {
// unimplemented!()
// }
// }
// )*
// }
// }
//
// unused! { csv, dot, fuzzy, grep, map, union }
3 changes: 1 addition & 2 deletions fst-bin/src/cmd/node.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use docopt::Docopt;
use fst::raw::Fst;
use serde::Deserialize;

use crate::util;
Expand Down Expand Up @@ -34,7 +33,7 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {
.and_then(|d| d.argv(&argv).deserialize())
.unwrap_or_else(|e| e.exit());
let mut wtr = util::get_buf_writer::<&str>(None)?;
let fst = unsafe { Fst::from_path(&args.arg_fst) }?;
let fst = unsafe { util::mmap_fst(&args.arg_fst)? };
let node = fst.node(args.arg_node_address);
w!(wtr, "{:?}", node);
Ok(())
Expand Down
3 changes: 1 addition & 2 deletions fst-bin/src/cmd/range.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use std::io;

use docopt::Docopt;
use fst::raw::Fst;
use serde::Deserialize;

use crate::util;
Expand Down Expand Up @@ -38,7 +37,7 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {
let args: Args = Docopt::new(USAGE)
.and_then(|d| d.argv(&argv).deserialize())
.unwrap_or_else(|e| e.exit());
let fst = unsafe { Fst::from_path(&args.arg_fst) }?;
let fst = unsafe { util::mmap_fst(&args.arg_fst)? };
let mut q = fst.range();
if let Some(ref start) = args.flag_start {
q = q.ge(start);
Expand Down
6 changes: 1 addition & 5 deletions fst-bin/src/cmd/rust.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,7 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {

w!(wtr, "lazy_static! {{");
w!(wtr, " pub static ref {}: ::fst::raw::Fst = ", args.arg_name);
w!(
wtr,
" ::fst::raw::Fst::from_static_slice({}_BYTES).unwrap();",
args.arg_name
);
w!(wtr, " ::fst::raw::Fst::new({}_BYTES).unwrap();", args.arg_name);
w!(wtr, "}}\n");

w!(wtr, "const {}_BYTES: &'static [u8] = b\"\\", args.arg_name);
Expand Down
3 changes: 2 additions & 1 deletion fst-bin/src/cmd/union.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ pub fn run(argv: Vec<String>) -> Result<(), Error> {

let mut sets = vec![];
for set_path in &args.arg_input {
sets.push(unsafe { fst::Set::from_path(set_path) }?);
let fst = unsafe { util::mmap_fst(set_path)? };
sets.push(fst::Set::from(fst));
}
let union = sets.iter().collect::<fst::set::OpBuilder>().union();
merged.extend_stream(union)?;
Expand Down
42 changes: 25 additions & 17 deletions fst-bin/src/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ use std::sync::Arc;
use std::thread;

use chan;
use fst::{self, Streamer, raw};
use fst::{self, raw, Streamer};
use num_cpus;
use tempdir::TempDir;

use crate::util;
use crate::Error;

pub struct Merger<I> {
Expand All @@ -29,10 +30,14 @@ pub struct Merger<I> {
type KV = (String, u64);

impl<I> Merger<I>
where I: Iterator<Item=Result<(String, u64), Error>> + Send + 'static {
where
I: Iterator<Item = Result<(String, u64), Error>> + Send + 'static,
{
pub fn new<T, P>(it: T, output: P) -> Self
where P: AsRef<Path>,
T: IntoIterator<IntoIter=I, Item=I::Item> {
where
P: AsRef<Path>,
T: IntoIterator<IntoIter = I, Item = I::Item>,
{
Merger {
it: it.into_iter(),
output: output.as_ref().to_path_buf(),
Expand All @@ -46,7 +51,9 @@ where I: Iterator<Item=Result<(String, u64), Error>> + Send + 'static {
}

pub fn value_merger<F>(mut self, f: F) -> Self
where F: Fn(u64, u64) -> u64 + Send + Sync + 'static {
where
F: Fn(u64, u64) -> u64 + Send + Sync + 'static,
{
self.value_merger = Some(Arc::new(f));
self
}
Expand Down Expand Up @@ -132,9 +139,11 @@ fn batcher<I, T, IT>(
batch_size: u32,
threads: u32,
) -> chan::Receiver<Result<Vec<T>, Error>>
where T: Send + 'static,
IT: Iterator<Item=Result<T, Error>> + Send + 'static,
I: IntoIterator<IntoIter=IT, Item=IT::Item> + Send + 'static {
where
T: Send + 'static,
IT: Iterator<Item = Result<T, Error>> + Send + 'static,
I: IntoIterator<IntoIter = IT, Item = IT::Item> + Send + 'static,
{
let batch_size = batch_size as usize;
let (send, recv) = chan::sync(cmp::min(1, threads as usize / 3));
let it = it.into_iter();
Expand Down Expand Up @@ -182,10 +191,7 @@ impl<B: Batchable + Send + 'static> Sorters<B> {
rsend.send(results);
});
}
Sorters {
send: bsend,
results: rrecv,
}
Sorters { send: bsend, results: rrecv }
}

fn create_fst(&self, batch: B) {
Expand Down Expand Up @@ -223,7 +229,9 @@ impl Batchable for KvBatch {
for &(ref k, v) in &self.kvs {
match builder.insert(k, v) {
Ok(_) => {}
Err(fst::Error::Fst(fst::raw::Error::DuplicateKey { .. })) => {}
Err(fst::Error::Fst(fst::raw::Error::DuplicateKey {
..
})) => {}
Err(err) => return Err(From::from(err)),
}
}
Expand All @@ -249,15 +257,15 @@ impl Batchable for UnionBatch {

let mut fsts = vec![];
for path in &self.fsts {
fsts.push(unsafe { raw::Fst::from_path(path) }?);
fsts.push(unsafe { util::mmap_fst(path)? });
}
let mut union = fsts.iter().collect::<raw::OpBuilder>().union();
while let Some((key, outputs)) = union.next() {
let mut merged = 0;
if let Some(ref value_merger) = self.value_merger {
merged = outputs.iter().fold(0, |merged, iv| {
value_merger(merged, iv.value)
});
merged = outputs
.iter()
.fold(0, |merged, iv| value_merger(merged, iv.value));
}
builder.insert(key, merged)?;
}
Expand Down
20 changes: 15 additions & 5 deletions fst-bin/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,18 @@ use std::io::{self, BufRead};
use std::path::{Path, PathBuf};

use csv;
use fst::raw::{Fst, Output};
use fst::{IntoStreamer, Streamer};
use fst::raw::Output;
use memmap::Mmap;

use crate::Error;

/// Opens the file at `path`, memory maps it and builds an `Fst` backed by
/// that mapping.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, if creating the memory map
/// fails, or if `Fst::new` rejects the mapped bytes.
///
/// # Safety
///
/// This function is `unsafe` because it calls `Mmap::map`, which is itself
/// `unsafe`. The caller takes on whatever contract `Mmap::map` imposes
/// (presumably that the underlying file is not modified or truncated while
/// the mapping is alive; confirm against the `memmap` documentation).
pub unsafe fn mmap_fst<P: AsRef<Path>>(path: P) -> Result<Fst<Mmap>, Error> {
    let file = File::open(path)?;
    let mapped = Mmap::map(&file)?;
    Ok(Fst::new(mapped)?)
}

/// Returns the escaped textual form of the byte `b`, as produced by
/// `std::ascii::escape_default`.
///
/// `escape_default` only ever yields ASCII bytes, so each one converts
/// directly to a `char`.
pub fn escape_input(b: u8) -> String {
    ascii::escape_default(b).map(char::from).collect()
}
Expand Down Expand Up @@ -61,9 +68,11 @@ pub fn print_stream<'f, W, I, S>(
outputs: bool,
stream: I,
) -> Result<(), Error>
where W: io::Write,
I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>,
S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> {
where
W: io::Write,
I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
{
let mut stream = stream.into_stream();
if outputs {
let mut wtr = csv::Writer::from_writer(wtr);
Expand All @@ -86,7 +95,8 @@ pub struct ConcatLines {
cur: Option<Lines>,
}

type Lines = io::Lines<io::BufReader<Box<dyn io::Read + Send + Sync + 'static>>>;
type Lines =
io::Lines<io::BufReader<Box<dyn io::Read + Send + Sync + 'static>>>;

impl ConcatLines {
pub fn new(mut inputs: Vec<PathBuf>) -> ConcatLines {
Expand Down
4 changes: 0 additions & 4 deletions fst-regex/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ keywords = ["search", "information", "retrieval", "dictionary", "map"]
license = "Unlicense/MIT"
edition = "2018"

[features]
mmap = ["fst/mmap"]
default = ["mmap"]

[dependencies]
fst = { path = "..", version = "0.3.1", default-features = false }
regex-syntax = "0.3"
Expand Down
20 changes: 14 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,18 +84,24 @@ Note that an error will be returned if a Levenshtein automaton gets too big
"##
)]
/*!
# Example: stream a map to a file
# Example: stream to a file and memory map it for searching
This shows how to create a `MapBuilder` that will stream construction of the
map to a file. Notably, this will never store the entire transducer in memory.
Instead, only constant memory is required.
Instead, only constant memory is required during construction.
For the search phase, we use the
[`memmap`](https://crates.io/crates/memmap)
crate to make the file available as a `&[u8]` without necessarily reading it
all into memory (the operating system will automatically handle that for you).
```rust,no_run
# fn example() -> Result<(), fst::Error> {
use std::fs::File;
use std::io;
use fst::{IntoStreamer, Streamer, Map, MapBuilder};
use memmap::Mmap;
// This is where we'll write our map to.
let mut wtr = io::BufWriter::new(File::create("map.fst")?);
Expand All @@ -112,7 +118,8 @@ build.finish()?;
// At this point, the map has been constructed. Now we'd like to search it.
// This creates a memory map, which enables searching the map without loading
// all of it into memory.
let map = unsafe { Map::from_path("map.fst") }?;
let mmap = unsafe { Mmap::map(&File::open("map.fst")?)? };
let map = Map::new(mmap)?;
// Query for keys that are greater than or equal to clarence.
let mut stream = map.range().ge("clarence").into_stream();
Expand Down Expand Up @@ -236,8 +243,9 @@ actually reading the entire set/map into memory. This use case is served well
by *memory maps*, which lets one assign the entire contents of a file to a
contiguous region of virtual memory.
Indeed, this crate encourages this mode of operation. Both sets and maps have
methods for memory mapping a finite state transducer from disk.
Indeed, this crate encourages this mode of operation. Both sets and maps can
be constructed from anything that provides an `AsRef<[u8]>` implementation,
which any memory map should.
This is particularly important for long running processes that use this crate,
since it enables the operating system to determine which regions of your
Expand All @@ -252,7 +260,7 @@ solid state drives where seek time is eliminated. Nevertheless, solid state
drives are not ubiquitous and it is possible that the OS will not be smart
enough to keep your memory mapped transducers in the page cache. In that case,
it is advisable to load the entire transducer into your process's memory (e.g.,
`Set::from_bytes`).
calling `Set::new` with a `Vec<u8>`).
# Streams
Expand Down
Loading

0 comments on commit 423a0c7

Please sign in to comment.