Skip to content

Commit

Permalink
Use dynamic fastfield codes for multivalues fixes #1093
Browse files Browse the repository at this point in the history
Use dynamic fastfield codes for multivalues fixes (only sorting case covered)
Rename get to get_val due to conflict with Vec
use u64 precision instead of u32 for get_range, to allow use of existing fast field interface interface (actually not sure if it would be better have a different interface)
  • Loading branch information
PSeitz committed Jun 30, 2021
1 parent de92f09 commit 6ba302c
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 56 deletions.
18 changes: 9 additions & 9 deletions fastfield_codecs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,14 @@ pub trait FastFieldCodecSerializer {

/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Return the value associated to the given document.
/// Return the value associated to the given position.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: u32) -> u64;
/// May panic if `position` is greater than the index.
fn get_val(&self, position: u64) -> u64;
}

#[derive(Debug, Clone)]
Expand All @@ -69,20 +69,20 @@ pub struct FastFieldStats {
}

impl<'a> FastFieldDataAccess for &'a [u64] {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}

impl<'a> FastFieldDataAccess for &'a Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}

impl FastFieldDataAccess for Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
fn get_val(&self, position: u64) -> u64 {
self[position as usize]
}
}

Expand Down
10 changes: 5 additions & 5 deletions fastfield_codecs/src/linearinterpol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);

let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
// calculate offset to ensure all values are positive
let mut offset = 0;
Expand Down Expand Up @@ -191,8 +191,8 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);

// let's sample at 0%, 5%, 10% .. 95%, 100%
Expand All @@ -205,7 +205,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
.iter()
.map(|pos| {
let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
let actual_value = fastfield_accessor.get_val(*pos as u64);
distance(calculated_value, actual_value)
})
.max()
Expand Down
11 changes: 6 additions & 5 deletions fastfield_codecs/src/multilinearinterpol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,8 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);

let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);

let mut first_function = Function {
end_pos: stats.num_vals,
Expand Down Expand Up @@ -309,9 +309,10 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val_in_first_block = fastfield_accessor.get(0);
let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
let last_val_in_first_block = fastfield_accessor.get(last_elem_in_first_chunk as u32 - 1);
let last_val_in_first_block =
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
let slope = get_slope(
first_val_in_first_block,
last_val_in_first_block,
Expand All @@ -328,7 +329,7 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
.map(|pos| {
let calculated_value =
get_calculated_value(first_val_in_first_block, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
let actual_value = fastfield_accessor.get_val(*pos as u64);
distance(calculated_value, actual_value)
})
.max()
Expand Down
14 changes: 8 additions & 6 deletions src/fastfield/multivalued/reader.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
use std::ops::Range;

use crate::fastfield::{
BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength,
};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use crate::DocId;

/// Reader for a multivalued `u64` fast field.
Expand All @@ -16,13 +14,13 @@ use crate::DocId;
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
vals_reader: DynamicFastFieldReader<Item>,
}

impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
vals_reader: DynamicFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
Expand All @@ -32,18 +30,20 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {

/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
start..stop
}

/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range_u64(range.start, &mut vals[..]);
self.vals_reader.get_range(range.start, &mut vals[..]);
}

/// Returns the minimum value for this fast field.
Expand All @@ -65,12 +65,14 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
}

/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
}

/// Returns the overall number of values in this field .
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
Expand Down
8 changes: 4 additions & 4 deletions src/fastfield/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub trait FastFieldReader<Item: FastValue>: Clone {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]);
fn get_range(&self, start: u64, output: &mut [Item]);

/// Returns the minimum value for this fast field.
///
Expand Down Expand Up @@ -120,7 +120,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::MultiLinearInterpol(reader) => reader.get(doc),
}
}
fn get_range(&self, start: DocId, output: &mut [Item]) {
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::LinearInterpol(reader) => reader.get_range(start, output),
Expand Down Expand Up @@ -226,8 +226,8 @@ impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
fn get_range(&self, start: u64, output: &mut [Item]) {
self.get_range_u64(start, output);
}

/// Returns the minimum value for this fast field.
Expand Down
15 changes: 10 additions & 5 deletions src/fastfield/readers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,22 +99,27 @@ impl FastFieldReaders {
Ok(())
}

pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field: Field,
index: usize,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, 0)?;
let fast_field_slice = self.fast_field_data(field, index)?;
DynamicFastFieldReader::open(fast_field_slice)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
self.typed_fast_field_reader_with_idx(field, 0)
}

pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let idx_reader = self.typed_fast_field_reader(field)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let vals_reader: BitpackedFastFieldReader<TFastValue> =
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}

Expand Down
21 changes: 20 additions & 1 deletion src/fastfield/serializer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,26 @@ impl CompositeFastFieldSerializer {
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, 0);
self.create_auto_detect_u64_fast_field_with_idx(
field,
stats,
fastfield_accessor,
data_iter_1,
data_iter_2,
0,
)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
pub fn create_auto_detect_u64_fast_field_with_idx(
&mut self,
field: Field,
stats: FastFieldStats,
fastfield_accessor: impl FastFieldDataAccess,
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);

let mut estimations = vec![];

Expand Down
12 changes: 6 additions & 6 deletions src/fastfield/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::termdict::TermOrdinal;
use crate::DocId;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
Expand Down Expand Up @@ -323,16 +322,17 @@ struct WriterFastFieldAccessProvider<'map, 'bitp> {
vals: &'bitp BlockedBitpacker,
}
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated to the given document.
/// Return the value associated to the given doc.
///
/// This accessor should return as fast as possible.
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: DocId) -> u64 {
/// May panic if `doc` is greater than the index.
fn get_val(&self, doc: u64) -> u64 {
if let Some(doc_id_map) = self.doc_id_map {
self.vals.get(doc_id_map.get_old_doc_id(doc) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
self.vals
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
} else {
self.vals.get(doc as usize)
}
Expand Down
Loading

0 comments on commit 6ba302c

Please sign in to comment.