Skip to content

Commit

Permalink
Optimize block wand for one and several TermScorer. (#1190)
Browse files Browse the repository at this point in the history
* Added optimisation using block wand for single TermScorer.

A proptest was also added.

* Fix block wand algorithm by taking the last doc id of the scorers up to and including the pivot scorer.
* In block wand, when block max score is lower than the threshold, advance the scorer with best score.
* Fix wrong condition in block_wand_single_scorer and add debug_assert to have an equality check on doc to break the loop.
  • Loading branch information
fmassot authored Nov 1, 2021
1 parent 5916ced commit 0462754
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 7 deletions.
94 changes: 88 additions & 6 deletions src/query/boolean_query/block_wand.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,27 +42,39 @@ fn find_pivot_doc(
Some((before_pivot_len, pivot_len, pivot_doc))
}

/// Advance the scorer with best score among the scorers[..pivot_len] to
/// the next doc candidate defined by the min of `last_doc_in_block + 1` for
/// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..].
///
/// `pivot_len` must be at least 1: the scorer at `pivot_len - 1` is the initial candidate.
///
/// Note: before and after calling this method, scorers need to be sorted by their `.doc()`.
fn block_max_was_too_low_advance_one_scorer(
    scorers: &mut Vec<TermScorerWithMaxScore>,
    pivot_len: usize,
) {
    debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
    // Start from the pivot scorer and walk back towards the first scorer, tracking both
    // the scorer with the highest max score and the smallest `last_doc_in_block`.
    let mut scorer_to_seek = pivot_len - 1;
    let mut global_max_score = scorers[scorer_to_seek].max_score;
    let mut doc_to_seek_after = scorers[scorer_to_seek].last_doc_in_block();
    for scorer_ord in (0..pivot_len - 1).rev() {
        let scorer = &scorers[scorer_ord];
        if scorer.last_doc_in_block() <= doc_to_seek_after {
            doc_to_seek_after = scorer.last_doc_in_block();
        }
        // Strict comparison: on equal max scores the scorer closest to the pivot is kept.
        if scorer.max_score > global_max_score {
            global_max_score = scorer.max_score;
            scorer_to_seek = scorer_ord;
        }
    }
    // Add +1 to go to the next block unless we are already at the end.
    if doc_to_seek_after != TERMINATED {
        doc_to_seek_after += 1;
    }
    // The scorers after the pivot may be positioned on an earlier doc than the end of the
    // blocks inspected above: the seek target must not go past any of them.
    for scorer in &scorers[pivot_len..] {
        if scorer.doc() <= doc_to_seek_after {
            doc_to_seek_after = scorer.doc();
        }
    }
    // `doc_to_seek_after` already is the next candidate (the +1 was applied above when
    // relevant), so it is used as the seek target as is, and the scorer is sought only once.
    scorers[scorer_to_seek].seek(doc_to_seek_after);

    // Only one scorer moved: put it back at its place to keep the `.doc()` ordering.
    restore_ordering(scorers, scorer_to_seek);
    debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
}
Expand Down Expand Up @@ -130,6 +142,9 @@ fn advance_all_scorers_on_pivot(term_scorers: &mut Vec<TermScorerWithMaxScore>,
term_scorers.sort_by_key(|scorer| scorer.doc());
}

/// Implements the Block-Max WAND (Weak AND) algorithm for dynamic pruning
/// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes".
/// Link: http://engineering.nyu.edu/~suel/papers/bmw.pdf
pub fn block_wand(
mut scorers: Vec<TermScorer>,
mut threshold: Score,
Expand Down Expand Up @@ -187,6 +202,7 @@ pub fn block_wand(
.iter_mut()
.map(|scorer| scorer.score())
.sum();

if score > threshold {
threshold = callback(pivot_doc, score);
}
Expand All @@ -195,6 +211,56 @@ pub fn block_wand(
}
}

/// Specialized version of [`block_wand`] for a single scorer.
/// In this case, the algorithm is simple and readable and faster (~ x3)
/// than the generic algorithm.
/// The algorithm behaves as follows:
/// - While we don't hit the end of the docset:
/// - While the block max score is under the `threshold`, go to the
/// next block.
/// - On a block, advance until the end and execute `callback`
/// when the doc score is strictly greater than the `threshold`.
///
/// `callback` receives the doc and its score; the value it returns
/// becomes the new `threshold` for the rest of the traversal.
pub fn block_wand_single_scorer(
mut scorer: TermScorer,
mut threshold: Score,
callback: &mut dyn FnMut(u32, Score) -> Score,
) {
let mut doc = scorer.doc();
loop {
// We position the scorer on a block that can reach
// the threshold.
while scorer.block_max_score() < threshold {
let last_doc_in_block = scorer.last_doc_in_block();
// Nothing left to visit after this block: stop here
// (this also keeps the `+ 1` below from being applied to `TERMINATED`).
if last_doc_in_block == TERMINATED {
return;
}
// First candidate doc after the block that was just ruled out.
doc = last_doc_in_block + 1;
// NOTE(review): presumably `shallow_seek` only moves the block cursor without
// decoding the block (see the comment on `seek` below) — confirm.
scorer.shallow_seek(doc);
}
// Seek will effectively load that block.
doc = scorer.seek(doc);
if doc == TERMINATED {
break;
}
// Score every doc of the current block.
loop {
let score = scorer.score();
// Strict comparison: a score equal to the threshold does not trigger the callback.
if score > threshold {
threshold = callback(doc, score);
}
debug_assert!(doc <= scorer.last_doc_in_block());
// End of the block: go back to the outer loop so that the next block's max score
// is checked against the (possibly updated) threshold.
if doc == scorer.last_doc_in_block() {
break;
}
doc = scorer.advance();
if doc == TERMINATED {
return;
}
}
// `doc` is the last doc of the block that was just scored: move right after it.
doc += 1;
scorer.shallow_seek(doc);
}
}

struct TermScorerWithMaxScore<'a> {
scorer: &'a mut TermScorer,
max_score: Score,
Expand Down Expand Up @@ -272,13 +338,14 @@ mod tests {
}

fn compute_checkpoints_for_each_pruning(
term_scorers: Vec<TermScorer>,
mut term_scorers: Vec<TermScorer>,
n: usize,
) -> Vec<(DocId, Score)> {
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
let mut limit: Score = 0.0;
super::block_wand(term_scorers, Score::MIN, &mut |doc, score| {

let callback = &mut |doc, score| {
heap.push(Float(score));
if heap.len() > n {
heap.pop().unwrap();
Expand All @@ -290,7 +357,14 @@ mod tests {
checkpoints.push((doc, score));
}
limit
});
};

if term_scorers.len() == 1 {
let scorer = term_scorers.pop().unwrap();
super::block_wand_single_scorer(scorer, Score::MIN, callback);
} else {
super::block_wand(term_scorers, Score::MIN, callback);
}
checkpoints
}

Expand Down Expand Up @@ -424,6 +498,14 @@ mod tests {
}
}

// Property test: runs `test_block_wand_aux` on inputs generated by `gen_term_scorers(1)`,
// with the number of generated cases set to 500.
// NOTE(review): presumably the argument `1` requests a single posting list, so that the
// single scorer code path is the one being exercised — confirm against `gen_term_scorers`.
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_block_wand_single_term_scorer((posting_lists, fieldnorms) in gen_term_scorers(1)) {
test_block_wand_aux(&posting_lists[..], &fieldnorms[..]);
}
}

#[test]
fn test_fn_reproduce_proptest() {
let postings_lists = &[
Expand Down
1 change: 1 addition & 0 deletions src/query/boolean_query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mod boolean_query;
mod boolean_weight;

pub(crate) use self::block_wand::block_wand;
pub(crate) use self::block_wand::block_wand_single_scorer;
pub use self::boolean_query::BooleanQuery;

#[cfg(test)]
Expand Down
2 changes: 1 addition & 1 deletion src/query/term_query/term_weight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ impl Weight for TermWeight {
callback: &mut dyn FnMut(DocId, Score) -> Score,
) -> crate::Result<()> {
let scorer = self.specialized_scorer(reader, 1.0)?;
crate::query::boolean_query::block_wand(vec![scorer], threshold, callback);
crate::query::boolean_query::block_wand_single_scorer(scorer, threshold, callback);
Ok(())
}
}
Expand Down

0 comments on commit 0462754

Please sign in to comment.