Skip to content

Commit

Permalink
Merge pull request #1097 from PSeitz/multifastfield
Browse files Browse the repository at this point in the history
Use dynamic fastfield codes for multivalues fixes #1093
  • Loading branch information
PSeitz authored Jun 30, 2021
2 parents d584975 + 3b5c1d7 commit aea2e77
Show file tree
Hide file tree
Showing 14 changed files with 370 additions and 300 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ rayon = "1.5"
lru = "0.6.5"
fastdivide = "0.3"
itertools = "0.10.0"
measure_time = "0.7.0"

[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
Expand Down
20 changes: 7 additions & 13 deletions fastfield_codecs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,14 @@ pub trait FastFieldCodecSerializer {

/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Return the value associated to the given document.
/// Return the value associated to the given position.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: u32) -> u64;
/// May panic if `position` is greater than the index.
fn get_val(&self, position: u64) -> u64;
}

#[derive(Debug, Clone)]
Expand All @@ -69,20 +69,14 @@ pub struct FastFieldStats {
}

impl<'a> FastFieldDataAccess for &'a [u64] {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
}
}

impl<'a> FastFieldDataAccess for &'a Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}

impl FastFieldDataAccess for Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}

Expand Down
10 changes: 5 additions & 5 deletions fastfield_codecs/src/linearinterpol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);

let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
// calculate offset to ensure all values are positive
let mut offset = 0;
Expand Down Expand Up @@ -191,8 +191,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);

// let's sample at 0%, 5%, 10% .. 95%, 100%
Expand All @@ -205,7 +205,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
.iter()
.map(|pos| {
let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
let actual_value = fastfield_accessor.get_val(*pos as u64);
distance(calculated_value, actual_value)
})
.max()
Expand Down
11 changes: 6 additions & 5 deletions fastfield_codecs/src/multilinearinterpol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,8 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);

let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);

let mut first_function = Function {
end_pos: stats.num_vals,
Expand Down Expand Up @@ -309,9 +309,10 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val_in_first_block = fastfield_accessor.get(0);
let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
let last_val_in_first_block = fastfield_accessor.get(last_elem_in_first_chunk as u32 - 1);
let last_val_in_first_block =
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
let slope = get_slope(
first_val_in_first_block,
last_val_in_first_block,
Expand All @@ -328,7 +329,7 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
.map(|pos| {
let calculated_value =
get_calculated_value(first_val_in_first_block, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
let actual_value = fastfield_accessor.get_val(*pos as u64);
distance(calculated_value, actual_value)
})
.max()
Expand Down
33 changes: 28 additions & 5 deletions src/core/segment_reader.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::common::HasLen;
use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
Expand Down Expand Up @@ -63,11 +62,9 @@ impl SegmentReader {
self.max_doc
}

/// Returns the number of documents.
/// Returns the number of alive documents.
/// Deleted documents are not counted.
///
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
self.num_docs
}
Expand All @@ -81,7 +78,7 @@ impl SegmentReader {
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset()
.map(|delete_set| delete_set.len() as DocId)
.map(|delete_set| delete_set.num_deleted() as DocId)
.unwrap_or(0u32)
}

Expand Down Expand Up @@ -329,6 +326,32 @@ mod test {
use crate::schema::{Schema, Term, STORED, TEXT};
use crate::DocId;

#[test]
fn test_num_alive() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("name", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let name = schema.get_field("name").unwrap();

{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with two docs
index_writer.delete_term(Term::from_field_text(name, "horse"));
index_writer.delete_term(Term::from_field_text(name, "cap"));

// ok, now we should have a deleted doc
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
assert_eq!(2, searcher.segment_reader(0).num_docs());
assert_eq!(4, searcher.segment_reader(0).max_doc());
Ok(())
}
#[test]
fn test_alive_docs_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
Expand Down
4 changes: 4 additions & 0 deletions src/fastfield/delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}

/// The number of deleted docs
pub fn num_deleted(&self) -> usize {
self.num_deleted
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.data.len()
Expand Down
14 changes: 8 additions & 6 deletions src/fastfield/multivalued/reader.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
use std::ops::Range;

use crate::fastfield::{
BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength,
};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use crate::DocId;

/// Reader for a multivalued `u64` fast field.
Expand All @@ -16,13 +14,13 @@ use crate::DocId;
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
vals_reader: DynamicFastFieldReader<Item>,
}

impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
vals_reader: DynamicFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
Expand All @@ -32,18 +30,20 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {

/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
start..stop
}

/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range_u64(range.start, &mut vals[..]);
self.vals_reader.get_range(range.start, &mut vals[..]);
}

/// Returns the minimum value for this fast field.
Expand All @@ -65,12 +65,14 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
}

/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
}

/// Returns the overall number of values in this field .
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
Expand Down
8 changes: 4 additions & 4 deletions src/fastfield/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub trait FastFieldReader<Item: FastValue>: Clone {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]);
fn get_range(&self, start: u64, output: &mut [Item]);

/// Returns the minimum value for this fast field.
///
Expand Down Expand Up @@ -120,7 +120,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::MultiLinearInterpol(reader) => reader.get(doc),
}
}
fn get_range(&self, start: DocId, output: &mut [Item]) {
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::LinearInterpol(reader) => reader.get_range(start, output),
Expand Down Expand Up @@ -226,8 +226,8 @@ impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
fn get_range(&self, start: u64, output: &mut [Item]) {
self.get_range_u64(start, output);
}

/// Returns the minimum value for this fast field.
Expand Down
15 changes: 10 additions & 5 deletions src/fastfield/readers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,22 +99,27 @@ impl FastFieldReaders {
Ok(())
}

pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field: Field,
index: usize,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, 0)?;
let fast_field_slice = self.fast_field_data(field, index)?;
DynamicFastFieldReader::open(fast_field_slice)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
self.typed_fast_field_reader_with_idx(field, 0)
}

pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let idx_reader = self.typed_fast_field_reader(field)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let vals_reader: BitpackedFastFieldReader<TFastValue> =
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}

Expand Down
21 changes: 20 additions & 1 deletion src/fastfield/serializer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,26 @@ impl CompositeFastFieldSerializer {
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, 0);
self.create_auto_detect_u64_fast_field_with_idx(
field,
stats,
fastfield_accessor,
data_iter_1,
data_iter_2,
0,
)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
pub fn create_auto_detect_u64_fast_field_with_idx(
&mut self,
field: Field,
stats: FastFieldStats,
fastfield_accessor: impl FastFieldDataAccess,
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);

let mut estimations = vec![];

Expand Down
12 changes: 6 additions & 6 deletions src/fastfield/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::termdict::TermOrdinal;
use crate::DocId;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
Expand Down Expand Up @@ -323,16 +322,17 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker,
}
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated to the given document.
/// Return the value associated to the given doc.
///
/// This accessor should return as fast as possible.
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: DocId) -> u64 {
/// May panic if `doc` is greater than the index.
fn get_val(&self, doc: u64) -> u64 {
if let Some(doc_id_map) = self.doc_id_map {
self.vals.get(doc_id_map.get_old_doc_id(doc) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
self.vals
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
} else {
self.vals.get(doc as usize)
}
Expand Down
Loading

0 comments on commit aea2e77

Please sign in to comment.