CR comments
- Using Option for fieldnorm readers.
- Fixing the ratio computation: use max_doc and not num_doc (see the sketch after the file list below).
fulmicoton committed Dec 10, 2021
1 parent 798e60f commit 37b6d6a
Showing 10 changed files with 324 additions and 91 deletions.
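
A quick illustration of the ratio fix: the fraction of documents still alive in a segment is num_docs / max_doc (max_doc counts every doc id ever assigned, alive or deleted), not 1 - num_deleted_docs / num_docs. A minimal standalone sketch, with made-up numbers and a hypothetical helper (not tantivy API):

fn prorated_num_tokens(segment_num_tokens: u64, num_docs: u32, max_doc: u32) -> u64 {
    // `max_doc` counts all doc ids ever assigned in the segment (alive + deleted);
    // `num_docs` counts only the alive ones.
    if max_doc == 0 {
        return 0; // empty segment, nothing to prorate
    }
    let ratio = num_docs as f64 / max_doc as f64;
    (segment_num_tokens as f64 * ratio) as u64
}

fn main() {
    // 1000 tokens indexed over max_doc = 100, of which 80 docs are still alive.
    assert_eq!(prorated_num_tokens(1000, 80, 100), 800);
    // Dividing the deleted count by num_docs instead, as the old code did,
    // would have given a ratio of 1.0 - 20.0 / 80.0 = 0.75 rather than 0.8.
}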
6 changes: 5 additions & 1 deletion src/core/segment_reader.rs
@@ -127,13 +127,17 @@ impl SegmentReader {
         self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
             let field_name = self.schema.get_field_name(field);
             let err_msg = format!(
-                "Field norm not found for field {:?}. Was it marked as normed during indexing?",
+                "Field norm not found for field {:?}. Was the field set to record norm during indexing?",
                 field_name
             );
             crate::TantivyError::SchemaError(err_msg)
         })
     }
 
+    pub(crate) fn fieldnorms_readers(&self) -> &FieldNormReaders {
+        &self.fieldnorm_readers
+    }
+
     /// Accessor to the segment's `StoreReader`.
     pub fn get_store_reader(&self) -> io::Result<StoreReader> {
         StoreReader::open(self.store_file.clone())
5 changes: 0 additions & 5 deletions src/fieldnorm/mod.rs
@@ -88,9 +88,4 @@ mod tests {
         }
         Ok(())
     }
-
-    #[test]
-    pub fn test_retrieve_fields_with_norms() {
-        //TODO
-    }
 }
78 changes: 47 additions & 31 deletions src/fieldnorm/writer.rs
Expand Up @@ -4,6 +4,7 @@ use super::fieldnorm_to_id;
use super::FieldNormsSerializer;
use crate::schema::Field;
use crate::schema::Schema;
use std::cmp::Ordering;
use std::{io, iter};

/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
@@ -12,8 +13,7 @@ use std::{io, iter};
 /// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
 /// byte per document per field.
 pub struct FieldNormsWriter {
-    fields: Vec<Field>,
-    fieldnorms_buffer: Vec<Vec<u8>>,
+    fieldnorms_buffers: Vec<Option<Vec<u8>>>,
 }
 
 impl FieldNormsWriter {
@@ -35,25 +35,20 @@ impl FieldNormsWriter {
     /// Initialize with state for tracking the field norm fields
     /// specified in the schema.
     pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
-        let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
-        let max_field = fields
-            .iter()
-            .map(Field::field_id)
-            .max()
-            .map(|max_field_id| max_field_id as usize + 1)
-            .unwrap_or(0);
-        FieldNormsWriter {
-            fields,
-            fieldnorms_buffer: iter::repeat_with(Vec::new)
-                .take(max_field)
-                .collect::<Vec<_>>(),
-        }
+        let mut fieldnorms_buffers: Vec<Option<Vec<u8>>> = iter::repeat_with(|| None)
+            .take(schema.num_fields())
+            .collect();
+        for field in FieldNormsWriter::fields_with_fieldnorm(schema) {
+            fieldnorms_buffers[field.field_id() as usize] = Some(Vec::with_capacity(1_000));
+        }
+        FieldNormsWriter { fieldnorms_buffers }
     }
 
     /// The memory used, including child buffers.
     pub fn mem_usage(&self) -> usize {
-        self.fieldnorms_buffer
+        self.fieldnorms_buffers
             .iter()
+            .flatten()
             .map(|buf| buf.capacity())
             .sum()
     }
@@ -62,8 +57,10 @@ impl FieldNormsWriter {
     ///
     /// Will extend with 0-bytes for documents that have not been seen.
     pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
-        for field in self.fields.iter() {
-            self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
+        for fieldnorms_buffer_opt in self.fieldnorms_buffers.iter_mut() {
+            if let Some(fieldnorms_buffer) = fieldnorms_buffer_opt.as_mut() {
+                fieldnorms_buffer.resize(max_doc as usize, 0u8);
+            }
         }
     }
 
@@ -76,14 +73,24 @@ impl FieldNormsWriter {
     /// * field - the field being set
     /// * fieldnorm - the number of terms present in document `doc` in field `field`
    pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
-        let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
-        assert!(
-            fieldnorm_buffer.len() <= doc as usize,
-            "Cannot register a given fieldnorm twice"
-        );
-        // we fill intermediary `DocId` as having a fieldnorm of 0.
-        fieldnorm_buffer.resize(doc as usize + 1, 0u8);
-        fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
+        if let Some(fieldnorm_buffer) = self
+            .fieldnorms_buffers
+            .get_mut(field.field_id() as usize)
+            .map(Option::as_mut)
+            .flatten()
+        {
+            match fieldnorm_buffer.len().cmp(&(doc as usize)) {
+                Ordering::Less => {
+                    // we fill intermediary `DocId` as having a fieldnorm of 0.
+                    fieldnorm_buffer.resize(doc as usize, 0u8);
+                }
+                Ordering::Equal => {}
+                Ordering::Greater => {
+                    panic!("Cannot register a given fieldnorm twice")
+                }
+            }
+            fieldnorm_buffer.push(fieldnorm_to_id(fieldnorm));
+        }
     }
 
     /// Serialize the seen fieldnorm values to the serializer for all fields.
@@ -92,17 +99,26 @@
         mut fieldnorms_serializer: FieldNormsSerializer,
         doc_id_map: Option<&DocIdMapping>,
     ) -> io::Result<()> {
-        for &field in self.fields.iter() {
-            let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
+        for (field, fieldnorms_buffer) in self
+            .fieldnorms_buffers
+            .iter()
+            .enumerate()
+            .map(|(field_id, fieldnorms_buffer_opt)| {
+                fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
+                    (Field::from_field_id(field_id as u32), fieldnorms_buffer)
+                })
+            })
+            .flatten()
+        {
             if let Some(doc_id_map) = doc_id_map {
-                let mut mapped_fieldnorm_values = vec![];
-                mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
-                // TODO is that a bug?
+                let mut mapped_fieldnorm_values = vec![0u8; doc_id_map.num_new_doc_ids()];
                 for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
-                    mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize];
+                    mapped_fieldnorm_values[new_doc_id] = fieldnorms_buffer[old_doc_id as usize];
                 }
                 fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
             } else {
-                fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
+                fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
             }
         }
         fieldnorms_serializer.close()?;
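
Aside: the padding behavior of record above can be illustrated in isolation. A minimal sketch using a plain Vec<u8> instead of tantivy types — doc ids must arrive in increasing order per field, and gaps are padded with fieldnorm id 0:

use std::cmp::Ordering;

fn record(buffer: &mut Vec<u8>, doc: usize, fieldnorm_id: u8) {
    match buffer.len().cmp(&doc) {
        // pad docs that were never recorded with a fieldnorm id of 0
        Ordering::Less => buffer.resize(doc, 0u8),
        Ordering::Equal => {}
        Ordering::Greater => panic!("Cannot register a given fieldnorm twice"),
    }
    buffer.push(fieldnorm_id);
}

fn main() {
    let mut buffer = Vec::new();
    record(&mut buffer, 0, 3);
    record(&mut buffer, 2, 7); // doc 1 was never recorded for this field
    assert_eq!(buffer, vec![3, 0, 7]);
}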
3 changes: 3 additions & 0 deletions src/indexer/doc_id_mapping.rs
@@ -75,6 +75,9 @@ impl DocIdMapping {
     pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
         self.new_doc_id_to_old.iter().cloned()
     }
+    pub fn num_new_doc_ids(&self) -> usize {
+        self.new_doc_id_to_old.len()
+    }
 }
 
 pub(crate) fn expect_field_id_for_sort_field(
79 changes: 45 additions & 34 deletions src/indexer/merger.rs
@@ -41,43 +41,56 @@ use tantivy_bitpacker::minmax;
 /// We do not allow segments with more than 2^31 documents.
 pub const MAX_DOC_LIMIT: u32 = 1 << 31;
 
-fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
-    let mut total_tokens = 0u64;
-    let mut count: [usize; 256] = [0; 256];
-    for reader in readers {
-        // When there are deletes, we use an approximation either
-        // - by using the fieldnorm
-        // - or by multiplying the total number of tokens by the ratio (1 - deleted docs / num docs).
-        if reader.has_deletes() {
-            if reader
-                .schema()
-                .get_field_entry(field)
-                .field_type()
-                .has_fieldnorms()
-            {
-                let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
-                for doc in reader.doc_ids_alive() {
-                    let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
-                    count[fieldnorm_id as usize] += 1;
-                }
-            } else {
-                let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens();
-                let ratio = 1f64 - reader.num_deleted_docs() as f64 / reader.num_docs() as f64;
-                total_tokens += (segment_num_tokens as f64 * ratio) as u64;
-            }
-        } else {
-            total_tokens += reader.inverted_index(field)?.total_num_tokens();
-        }
-    }
-    Ok(total_tokens
-        + count
-            .iter()
-            .cloned()
-            .enumerate()
-            .map(|(fieldnorm_ord, count)| {
-                count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
-            })
-            .sum::<u64>())
-}
+fn estimate_total_num_tokens_in_single_segment(
+    reader: &SegmentReader,
+    field: Field,
+) -> crate::Result<u64> {
+    // If there are no deletes, we can simply use the value saved in the posting list.
+    // Note that even this value is not necessarily exact, as it could itself be the
+    // result of a merge of segments containing deletes.
+    if !reader.has_deletes() {
+        return Ok(reader.inverted_index(field)?.total_num_tokens());
+    }
+
+    // When there are deletes, we approximate using the fieldnorms, when available.
+    if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
+        let mut count: [usize; 256] = [0; 256];
+        for doc in reader.doc_ids_alive() {
+            let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
+            count[fieldnorm_id as usize] += 1;
+        }
+        let total_num_tokens = count
+            .iter()
+            .cloned()
+            .enumerate()
+            .map(|(fieldnorm_ord, count)| {
+                count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
+            })
+            .sum::<u64>();
+        return Ok(total_num_tokens);
+    }
+
+    // There are no fieldnorms available.
+    // Here we just do a pro-rata of the overall number of tokens, using the ratio of
+    // documents alive.
+    let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens();
+    if reader.max_doc() == 0 {
+        // That supposedly never happens, but let's be a bit defensive here.
+        return Ok(0u64);
+    }
+    let ratio = reader.num_docs() as f64 / reader.max_doc() as f64;
+    Ok((segment_num_tokens as f64 * ratio) as u64)
+}
+
+fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
+    let mut total_num_tokens: u64 = 0;
+    for reader in readers {
+        total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?;
+    }
+    Ok(total_num_tokens)
+}
 
 pub struct IndexMerger {
Expand Down Expand Up @@ -863,10 +876,8 @@ impl IndexMerger {
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
}

// The total number of tokens will only be exact when there has been no deletes
// and if the field has a norm.
//
// Otherwise, we approximate by removing deleted documents proportionally.
// Note that the total number of tokens is not exact.
// It is only used as a parameter in the BM25 formula.
let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?;

// Create the total list of doc ids
Expand Down
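
For intuition on the fieldnorm-based estimate: it histograms the one-byte fieldnorm ids of the alive documents and decodes each bucket back to a token count. A standalone sketch with a toy decode function standing in for tantivy's precomputed 256-entry FieldNormReader::id_to_fieldnorm table:

// Toy stand-in for the real 256-entry decode table (identity, for the sketch only).
fn id_to_fieldnorm(id: u8) -> u32 {
    u32::from(id)
}

fn main() {
    // Suppose the alive docs in a segment have fieldnorm ids [3, 3, 5].
    let mut count = [0usize; 256];
    for &fieldnorm_id in &[3u8, 3, 5] {
        count[fieldnorm_id as usize] += 1;
    }
    let total_num_tokens: u64 = count
        .iter()
        .enumerate()
        .map(|(ord, &c)| c as u64 * u64::from(id_to_fieldnorm(ord as u8)))
        .sum();
    assert_eq!(total_num_tokens, 2 * 3 + 5); // estimated 11 tokens
}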
5 changes: 1 addition & 4 deletions src/indexer/segment_writer.rs
@@ -234,10 +234,7 @@ impl SegmentWriter {
                         term_buffer,
                     )
                 };
-
-                if field_entry.has_fieldnorms() {
-                    self.fieldnorms_writer.record(doc_id, field, num_tokens);
-                }
+                self.fieldnorms_writer.record(doc_id, field, num_tokens);
             }
             FieldType::U64(_) => {
                 for field_value in field_values {