Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Store fuzzy/bucketed positions in word_position_docids database
Browse files Browse the repository at this point in the history
Fixes (when merged into meilisearch): meilisearch/meilisearch#3222
  • Loading branch information
loiclec committed Dec 22, 2022
1 parent a8defb5 commit b307c93
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 8 deletions.
68 changes: 68 additions & 0 deletions milli/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,27 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi
(field_id as u32) << 16 | (relative as u32)
}

/// Compute the "bucketed" absolute position from the field id and relative position in the field.
///
/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it
/// gets larger:
///
/// - relative positions `0..16` are kept intact;
/// - relative positions `16..24` all become `24`;
/// - relative positions from `24` upwards are rounded up to the nearest power of two
///   (a value that already is a power of two is left unchanged), saturating at `u16::MAX`
///   when that power of two does not fit in a `u16`.
///
/// The bucketed relative position is then combined with `field_id` by
/// [`absolute_from_relative_position`].
pub fn bucketed_absolute_from_relative_position(
    field_id: FieldId,
    relative: RelativePosition,
) -> Position {
    let bucketed = match relative {
        // The first few relative positions are kept intact.
        0..=15 => relative,
        // Relative positions between 16 and 24 (excluded) all become equal to 24.
        16..=23 => 24,
        // Everything else is rounded up to a power of two. Pure integer arithmetic is used
        // so the result never depends on floating-point rounding, and the overflow case
        // (inputs above 32768, whose next power of two is 65536) saturates explicitly.
        _ => relative.checked_next_power_of_two().unwrap_or(u16::MAX),
    };
    absolute_from_relative_position(field_id, bucketed)
}

/// Transform a raw obkv store into a JSON Object.
pub fn obkv_to_json(
displayed_fields: &[FieldId],
Expand Down Expand Up @@ -329,4 +350,51 @@ mod tests {

assert_eq!(&actual, expected);
}

// Pins the output of `bucketed_absolute_from_relative_position` with inline snapshots,
// first for field id 0 across the whole relative-position range, then for a few other
// field ids.
#[test]
fn bucketed_position() {
// Relative positions below 16 are returned unchanged.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 0), @"0");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 1), @"1");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 2), @"2");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 15), @"15");
// Relative positions 16 to 23 all collapse to 24.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 16), @"24");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 19), @"24");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 20), @"24");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 21), @"24");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 22), @"24");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 23), @"24");
// From 24 upwards, positions are rounded up to a power of two.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 24), @"32");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 25), @"32");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 30), @"32");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 40), @"64");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 50), @"64");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 60), @"64");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 70), @"128");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 80), @"128");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 90), @"128");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 100), @"128");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 110), @"128");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 120), @"128");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 130), @"256");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 1000), @"1024");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 2000), @"2048");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 4000), @"4096");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 8000), @"8192");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 9000), @"16384");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 10_000), @"16384");
// At the top of the range the result is capped at 65535 (the next power of two, 65536,
// does not fit in the 16 bits reserved for the relative position).
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 65_500), @"65535");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(0, 65_535), @"65535");

// Field id 1: same buckets, offset by 1 << 16 = 65536 (e.g. 20 -> 65536 + 24).
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 0), @"65536");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 1), @"65537");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 20), @"65560");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 1000), @"66560");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(1, 65_535), @"131071");

// Field id 2: lower and upper bounds of its range.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(2, 0), @"131072");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(2, 65_535), @"196607");

// Largest field id: the upper bound is the maximum 32-bit value.
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(65_535, 0), @"4294901760");
insta::assert_debug_snapshot!(bucketed_absolute_from_relative_position(65_535, 65_535), @"4294967295");
}
}
18 changes: 12 additions & 6 deletions milli/src/search/criteria/exactness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ use crate::search::criteria::{
InitialCandidates,
};
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::{absolute_from_relative_position, FieldId, Result};
use crate::{
absolute_from_relative_position, bucketed_absolute_from_relative_position, FieldId, Result,
};

pub struct Exactness<'t> {
ctx: &'t dyn Context<'t>,
Expand Down Expand Up @@ -285,30 +287,34 @@ fn attribute_start_with_docids(
) -> heed::Result<Vec<RoaringBitmap>> {
let mut attribute_candidates_array = Vec::new();
// start from attribute first position
let mut pos = absolute_from_relative_position(attribute_id, 0);
let mut relative_pos = 0;
for part in query {
use ExactQueryPart::*;
match part {
Synonyms(synonyms) => {
let bucketed_position =
bucketed_absolute_from_relative_position(attribute_id, relative_pos);
let mut synonyms_candidates = RoaringBitmap::new();
for word in synonyms {
let wc = ctx.word_position_docids(word, pos)?;
let wc = ctx.word_position_docids(word, bucketed_position)?;
if let Some(word_candidates) = wc {
synonyms_candidates |= word_candidates;
}
}
attribute_candidates_array.push(synonyms_candidates);
pos += 1;
relative_pos += 1;
}
Phrase(phrase) => {
for word in phrase {
let bucketed_position =
bucketed_absolute_from_relative_position(attribute_id, relative_pos);
if let Some(word) = word {
let wc = ctx.word_position_docids(word, pos)?;
let wc = ctx.word_position_docids(word, bucketed_position)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
}
}
pos += 1;
relative_pos += 1;
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ use super::helpers::{
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{DocumentId, Result};
use crate::{
bucketed_absolute_from_relative_position, relative_from_absolute_position, DocumentId, Result,
};

/// Extracts the word positions and the documents ids where this word appear.
///
Expand Down Expand Up @@ -37,9 +39,12 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
let document_id = DocumentId::from_be_bytes(document_id_bytes);

for position in read_u32_ne_bytes(value) {
let (field_id, relative) = relative_from_absolute_position(position);
let bucketed_position = bucketed_absolute_from_relative_position(field_id, relative);

key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.extend_from_slice(&position.to_be_bytes());
key_buffer.extend_from_slice(&bucketed_position.to_be_bytes());

word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
Expand Down

0 comments on commit b307c93

Please sign in to comment.