From 46634e82c9e139547a27d4134853fb10bcbe41a5 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 25 May 2023 10:59:16 +0200 Subject: [PATCH 1/3] Optimize ConjunctionDISI.createConjunction This is showing up as a little hot when profiling some queries (in ES enrich caching). Almost all the time spent in this method is just burnt on ceremony around stream indirections that don't inline. Moving this to iterators, simplifying the check for same doc id and also saving one iteration (for the min cost) makes this method far cheaper and IMO easier to read. --- .../apache/lucene/search/ConjunctionDISI.java | 47 ++++++++----------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java index b70224f1ec50..4fc74b4f0871 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java @@ -20,7 +20,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Comparator; import java.util.List; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitSet; @@ -99,24 +98,26 @@ static DocIdSetIterator createConjunction( allIterators.size() > 0 ? allIterators.get(0).docID() : twoPhaseIterators.get(0).approximation.docID(); - boolean iteratorsOnTheSameDoc = allIterators.stream().allMatch(it -> it.docID() == curDoc); - iteratorsOnTheSameDoc = - iteratorsOnTheSameDoc - && twoPhaseIterators.stream().allMatch(it -> it.approximation().docID() == curDoc); - if (iteratorsOnTheSameDoc == false) { - throw new IllegalArgumentException( - "Sub-iterators of ConjunctionDISI are not on the same document!"); + long minCost = Long.MAX_VALUE; + for (DocIdSetIterator allIterator : allIterators) { + if (allIterator.docID() != curDoc) { + throwSubIteratorsNotOnSameDocument(); + } + minCost = Math.min(allIterator.cost(), minCost); + } + for (TwoPhaseIterator it : twoPhaseIterators) { + if (it.approximation().docID() != curDoc) { + throwSubIteratorsNotOnSameDocument(); + } } - - long minCost = allIterators.stream().mapToLong(DocIdSetIterator::cost).min().getAsLong(); List bitSetIterators = new ArrayList<>(); List iterators = new ArrayList<>(); for (DocIdSetIterator iterator : allIterators) { - if (iterator.cost() > minCost && iterator instanceof BitSetIterator) { + if (iterator instanceof BitSetIterator bitSetIterator && bitSetIterator.cost() > minCost) { // we put all bitset iterators into bitSetIterators // except if they have the minimum cost, since we need // them to lead the iteration in that case - bitSetIterators.add((BitSetIterator) iterator); + bitSetIterators.add(bitSetIterator); } else { iterators.add(iterator); } @@ -142,6 +143,11 @@ static DocIdSetIterator createConjunction( return disi; } + private static void throwSubIteratorsNotOnSameDocument() { + throw new IllegalArgumentException( + "Sub-iterators of ConjunctionDISI are not on the same document!"); + } + final DocIdSetIterator lead1, lead2; final DocIdSetIterator[] others; @@ -150,14 +156,7 @@ private ConjunctionDISI(List iterators) { // Sort the array the first time to allow the least frequent DocsEnum to // lead the matching. - CollectionUtil.timSort( - iterators, - new Comparator() { - @Override - public int compare(DocIdSetIterator o1, DocIdSetIterator o2) { - return Long.compare(o1.cost(), o2.cost()); - } - }); + CollectionUtil.timSort(iterators, (o1, o2) -> Long.compare(o1.cost(), o2.cost())); lead1 = iterators.get(0); lead2 = iterators.get(1); others = iterators.subList(2, iterators.size()).toArray(new DocIdSetIterator[0]); @@ -326,13 +325,7 @@ private ConjunctionTwoPhaseIterator( assert twoPhaseIterators.size() > 0; CollectionUtil.timSort( - twoPhaseIterators, - new Comparator() { - @Override - public int compare(TwoPhaseIterator o1, TwoPhaseIterator o2) { - return Float.compare(o1.matchCost(), o2.matchCost()); - } - }); + twoPhaseIterators, (o1, o2) -> Float.compare(o1.matchCost(), o2.matchCost())); this.twoPhaseIterators = twoPhaseIterators.toArray(new TwoPhaseIterator[twoPhaseIterators.size()]); From 87b20d73684c43112b8f7a90c6be7b3fcce2b6fa Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 25 May 2023 22:57:38 +0200 Subject: [PATCH 2/3] changes entry --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b9e6b2b19eeb..6ffb15808f66 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -76,6 +76,8 @@ Optimizations * GITHUB#12160: Concurrent rewrite for AbstractKnnVectorQuery. (Kaival Parikh) +* GITHUB#12328: Optimize ConjunctionDISI.createConjunction (Armin Braun) + Bug Fixes --------------------- From 3ce79c14ba6d283342fcc5576fe7b29e0454d9af Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Fri, 26 May 2023 12:06:12 +0200 Subject: [PATCH 3/3] address merge conflict --- lucene/CHANGES.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 67bffbbb7c24..faf85efa31a0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -76,8 +76,6 @@ Optimizations * GITHUB#11857, GITHUB#11859, GITHUB#11893, GITHUB#11909: Hunspell: improved suggestion performance (Peter Gromov) -* GITHUB#12160: Concurrent rewrite for AbstractKnnVectorQuery. (Kaival Parikh) - * GITHUB#12328: Optimize ConjunctionDISI.createConjunction (Armin Braun) Bug Fixes