Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prepare for multiple fastfield codecs #1063

Merged
merged 3 commits into from
May 31, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/collector/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use super::*;
use crate::core::SegmentReader;
use crate::fastfield::BytesFastFieldReader;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::{FastFieldReader, MultiValueLength};
use crate::fastfield::FastFieldReader;
use crate::schema::Field;
use crate::DocId;
use crate::Score;
Expand Down
7 changes: 5 additions & 2 deletions src/fastfield/bytes/writer.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use std::io;

use crate::fastfield::serializer::FastFieldSerializer;
use crate::schema::{Document, Field, Value};
use crate::DocId;
use crate::{fastfield::serializer::FastFieldSerializer, indexer::doc_id_mapping::DocIdMapping};
use crate::{
fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping,
};

/// Writer for byte array (as in, any number of bytes per document) fast fields
///
Expand Down Expand Up @@ -104,7 +107,7 @@ impl BytesFastFieldWriter {
/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
// writing the offset index
Expand Down
15 changes: 8 additions & 7 deletions src/fastfield/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pub use self::reader::BitpackedFastFieldReader;
pub use self::reader::DynamicFastFieldReader;
pub use self::reader::FastFieldReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::CompositeFastFieldSerializer;
pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::Cardinality;
Expand Down Expand Up @@ -256,7 +257,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>13u64));
fast_field_writers.add_document(&doc!(*FIELD=>14u64));
Expand All @@ -283,7 +284,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
Expand Down Expand Up @@ -323,7 +324,7 @@ mod tests {

{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
Expand Down Expand Up @@ -353,7 +354,7 @@ mod tests {

{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
fast_field_writers.add_document(&doc!(*FIELD=>0u64));
Expand Down Expand Up @@ -392,7 +393,7 @@ mod tests {
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for i in -100i64..10_000i64 {
let mut doc = Document::default();
Expand Down Expand Up @@ -435,7 +436,7 @@ mod tests {

{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
Expand Down Expand Up @@ -470,7 +471,7 @@ mod tests {
let directory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
Expand Down
9 changes: 5 additions & 4 deletions src/fastfield/multivalued/writer.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::fastfield::serializer::FastSingleFieldSerializer;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::serializer::DynamicFastFieldSerializer;
use crate::fastfield::serializer::FastFieldSerializer;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};
use crate::termdict::TermOrdinal;
Expand Down Expand Up @@ -134,7 +135,7 @@ impl MultiValuedFastFieldWriter {
///
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
Expand All @@ -154,7 +155,7 @@ impl MultiValuedFastFieldWriter {
}
{
// writing the values themselves.
let mut value_serializer: FastSingleFieldSerializer<'_, _>;
let mut value_serializer: DynamicFastFieldSerializer<'_, _>;
match mapping_opt {
Some(mapping) => {
value_serializer = serializer.new_u64_fast_field_with_idx(
Expand Down
4 changes: 2 additions & 2 deletions src/fastfield/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::common::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
Expand Down Expand Up @@ -209,7 +209,7 @@ impl<Item: FastValue> From<Vec<Item>> for BitpackedFastFieldReader<Item> {
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = FastFieldSerializer::from_write(write)
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
Expand Down
146 changes: 130 additions & 16 deletions src/fastfield/serializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;

/// `FastFieldSerializer` is in charge of serializing
/// `CompositeFastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
/// Fast fields are encoded using bit-packing.
/// Fast fields have differnt encodings like bit-packing.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo

///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
Expand All @@ -27,16 +27,16 @@ use tantivy_bitpacker::BitPacker;
/// * ...
/// * `close_field()`
/// * `close()`
pub struct FastFieldSerializer {
pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
}

impl FastFieldSerializer {
impl CompositeFastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer { composite_write })
Ok(CompositeFastFieldSerializer { composite_write })
}

/// Start serializing a new u64 fast field
Expand All @@ -45,7 +45,7 @@ impl FastFieldSerializer {
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
) -> io::Result<DynamicFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}

Expand All @@ -56,9 +56,9 @@ impl FastFieldSerializer {
min_value: u64,
max_value: u64,
idx: usize,
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
) -> io::Result<DynamicFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
DynamicFastFieldSerializer::open(field_write, min_value, max_value)
}

/// Start serializing a new [u8] fast field
Expand All @@ -79,14 +79,111 @@ impl FastFieldSerializer {
}
}

pub struct FastSingleFieldSerializer<'a, W: Write> {
#[derive(Debug, Clone)]
pub struct EstimationStats {
min_value: u64,
max_value: u64,
}
/// The FastFieldSerializer trait is the common interface
/// implemented by every fastfield serializer variant.
///
/// `DynamicFastFieldSerializer` is the enum wrapping all variants.
/// It is used to create an serializer instance.
pub trait FastFieldSerializer {
/// add value to serializer
fn add_val(&mut self, val: u64) -> io::Result<()>;
/// finish serializing a field.
fn close_field(self) -> io::Result<()>;
}

/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldSerializerEstimate {
/// returns an estimate of the compression ratio.
fn estimate(
/*fastfield_accessor: impl FastFieldReader<u64>,*/ stats: EstimationStats,
) -> (f32, &'static str);
/// the unique name of the compressor
fn name() -> &'static str;
}

pub enum DynamicFastFieldSerializer<'a, W: Write> {
Bitpacked(BitpackedFastFieldSerializer<'a, W>),
}

impl<'a, W: Write> DynamicFastFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// compute the minimum number of bits required to encode
/// values.
pub fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<DynamicFastFieldSerializer<'a, W>> {
let stats = EstimationStats {
min_value,
max_value,
};
let (_ratio, name) = (
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(stats),
BitpackedFastFieldSerializer::<Vec<u8>>::name(),
);
Self::open_from_name(write, min_value, max_value, name)
}

/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// compute the minimum number of bits required to encode
/// values.
pub fn open_from_name(
write: &'a mut W,
min_value: u64,
max_value: u64,
name: &str,
) -> io::Result<DynamicFastFieldSerializer<'a, W>> {
// Weirdly the W generic on BitpackedFastFieldSerializer needs to be set,
// although name() doesn't use it
let variant = if name == BitpackedFastFieldSerializer::<Vec<u8>>::name() {
DynamicFastFieldSerializer::Bitpacked(BitpackedFastFieldSerializer::open(
write, min_value, max_value,
)?)
} else {
panic!("unknown fastfield serializer {}", name);
};

Ok(variant)
}
}
impl<'a, W: Write> FastFieldSerializer for DynamicFastFieldSerializer<'a, W> {
fn add_val(&mut self, val: u64) -> io::Result<()> {
match self {
Self::Bitpacked(serializer) => serializer.add_val(val),
}
}
fn close_field(self) -> io::Result<()> {
match self {
Self::Bitpacked(serializer) => serializer.close_field(),
}
}
}

pub struct BitpackedFastFieldSerializer<'a, W: Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_bits: u8,
}

impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
Expand All @@ -99,34 +196,51 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
assert!(min_value <= max_value);
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new();
Ok(FastSingleFieldSerializer {
Ok(BitpackedFastFieldSerializer {
bit_packer,
write,
min_value,
num_bits,
})
}
}

impl<'a, W: 'a + Write> FastFieldSerializer for BitpackedFastFieldSerializer<'a, W> {
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
Ok(())
}

pub fn close_field(mut self) -> io::Result<()> {
fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}

impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
fn estimate(
/*_fastfield_accessor: impl FastFieldReader<u64>, */ stats: EstimationStats,
) -> (f32, &'static str) {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
let name = Self::name();
(ratio, name)
}
fn name() -> &'static str {
"Bitpacked"
}
}

pub struct FastBytesFieldSerializer<'a, W: Write> {
write: &'a mut W,
}
Expand Down
7 changes: 4 additions & 3 deletions src/fastfield/writer.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use super::multivalued::MultiValuedFastFieldWriter;
use crate::common;
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
use crate::fastfield::serializer::FastFieldSerializer;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
Expand Down Expand Up @@ -148,7 +149,7 @@ impl FastFieldsWriter {
/// order to the fast field serializer.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
Expand Down Expand Up @@ -272,7 +273,7 @@ impl IntFastFieldWriter {
/// Push the fast fields value to the `FastFieldWriter`.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
let (min, max) = if self.val_min > self.val_max {
Expand Down
Loading