Skip to content

Commit

Permalink
Optimize ConjunctionDISI.createConjunction (#12328)
Browse files Browse the repository at this point in the history
This method is showing up as a little hot when profiling some queries.
Almost all the time spent in this method is just burnt on ceremony
around stream indirections that don't inline.
Moving this to iterators, simplifying the check for same doc id and also saving one iteration (for the min
cost) makes this method far cheaper and easier to read.
  • Loading branch information
original-brownbear authored and javanna committed May 26, 2023
1 parent e0ac15a commit b568492
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 26 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ Optimizations

* GITHUB#12235: Optimize HNSW diversity calculation. (Patrick Zhai)

* GITHUB#12328: Optimize ConjunctionDISI.createConjunction (Armin Braun)

Bug Fixes
---------------------

Expand Down
45 changes: 19 additions & 26 deletions lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSet;
Expand Down Expand Up @@ -99,20 +98,22 @@ static DocIdSetIterator createConjunction(
allIterators.size() > 0
? allIterators.get(0).docID()
: twoPhaseIterators.get(0).approximation.docID();
boolean iteratorsOnTheSameDoc = allIterators.stream().allMatch(it -> it.docID() == curDoc);
iteratorsOnTheSameDoc =
iteratorsOnTheSameDoc
&& twoPhaseIterators.stream().allMatch(it -> it.approximation().docID() == curDoc);
if (iteratorsOnTheSameDoc == false) {
throw new IllegalArgumentException(
"Sub-iterators of ConjunctionDISI are not on the same document!");
long minCost = Long.MAX_VALUE;
for (DocIdSetIterator allIterator : allIterators) {
if (allIterator.docID() != curDoc) {
throwSubIteratorsNotOnSameDocument();
}
minCost = Math.min(allIterator.cost(), minCost);
}
for (TwoPhaseIterator it : twoPhaseIterators) {
if (it.approximation().docID() != curDoc) {
throwSubIteratorsNotOnSameDocument();
}
}

long minCost = allIterators.stream().mapToLong(DocIdSetIterator::cost).min().getAsLong();
List<BitSetIterator> bitSetIterators = new ArrayList<>();
List<DocIdSetIterator> iterators = new ArrayList<>();
for (DocIdSetIterator iterator : allIterators) {
if (iterator.cost() > minCost && iterator instanceof BitSetIterator) {
if (iterator instanceof BitSetIterator && iterator.cost() > minCost) {
// we put all bitset iterators into bitSetIterators
// except if they have the minimum cost, since we need
// them to lead the iteration in that case
Expand Down Expand Up @@ -142,6 +143,11 @@ static DocIdSetIterator createConjunction(
return disi;
}

private static void throwSubIteratorsNotOnSameDocument() {
throw new IllegalArgumentException(
"Sub-iterators of ConjunctionDISI are not on the same document!");
}

final DocIdSetIterator lead1, lead2;
final DocIdSetIterator[] others;

Expand All @@ -150,14 +156,7 @@ private ConjunctionDISI(List<? extends DocIdSetIterator> iterators) {

// Sort the array the first time to allow the least frequent DocsEnum to
// lead the matching.
CollectionUtil.timSort(
iterators,
new Comparator<DocIdSetIterator>() {
@Override
public int compare(DocIdSetIterator o1, DocIdSetIterator o2) {
return Long.compare(o1.cost(), o2.cost());
}
});
CollectionUtil.timSort(iterators, (o1, o2) -> Long.compare(o1.cost(), o2.cost()));
lead1 = iterators.get(0);
lead2 = iterators.get(1);
others = iterators.subList(2, iterators.size()).toArray(new DocIdSetIterator[0]);
Expand Down Expand Up @@ -326,13 +325,7 @@ private ConjunctionTwoPhaseIterator(
assert twoPhaseIterators.size() > 0;

CollectionUtil.timSort(
twoPhaseIterators,
new Comparator<TwoPhaseIterator>() {
@Override
public int compare(TwoPhaseIterator o1, TwoPhaseIterator o2) {
return Float.compare(o1.matchCost(), o2.matchCost());
}
});
twoPhaseIterators, (o1, o2) -> Float.compare(o1.matchCost(), o2.matchCost()));

this.twoPhaseIterators =
twoPhaseIterators.toArray(new TwoPhaseIterator[twoPhaseIterators.size()]);
Expand Down

0 comments on commit b568492

Please sign in to comment.