From 72a628622529e2b15315b34c29fc896c74964698 Mon Sep 17 00:00:00 2001 From: anton Date: Sun, 25 Feb 2024 22:44:09 +0100 Subject: [PATCH 1/8] Made DocIdsWriter use DISI when reading documents with an IntersectVisitor. Instead of calling IntersectVisitor.visit for each doc in the readDelta16 and readInts32 methods, create a DocIdSetIterator and call IntersectVisitor.visit(DocIdSetIterator) instead. This seems to make Lucene faster at sorting and range querying tasks - the hypothesis being that it is due to fewer virtual calls. --- .../apache/lucene/util/bkd/DocIdsWriter.java | 69 +++++++++++++++++-- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java index 69818b337f21..d8e03f54d527 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java @@ -36,6 +36,65 @@ final class DocIdsWriter { private final int[] scratch; + private final ScratchDocIdSetIterator scratchDocIdSetIterator = new ScratchDocIdSetIterator(); + + /** + DocIdSetIterator to be used to iterate over the scratch buffer. A single instance is reused to avoid + re-allocating the object. The reset method should be called before each use with the count. + + The main reason for existing is to be able to call the {@link IntersectVisitor#visit(DocIdSetIterator)} method + rather than the {@link IntersectVisitor#visit(int)} method. This seems to make a difference in performance, + probably due to fewer virtual calls then happening (once per read call rather than once per doc). + + */ + private class ScratchDocIdSetIterator extends DocIdSetIterator { + + private int index = -1; + private int count = -1; + + @Override + public int docID() { + if (index < 0) { + return -1; + } + if (index >= count) { + return NO_MORE_DOCS; + } + return scratch[index]; + } + + @Override + public int nextDoc() throws IOException { + index++; + if (index >= count) { + return NO_MORE_DOCS; + } + return scratch[index]; + } + + @Override + public int advance(int target) throws IOException { + while (index < count && scratch[index] < target) { + index++; + } + if (index >= count) { + return NO_MORE_DOCS; + } else { + return scratch[index]; + } + } + + @Override + public long cost() { + return count; + } + + void reset(int count) { + this.count = count; + this.index = -1; + } + } + DocIdsWriter(int maxPointsInLeaf) { scratch = new int[maxPointsInLeaf]; } @@ -318,9 +377,8 @@ private static void readLegacyDeltaVInts(IndexInput in, int count, IntersectVisi private void readDelta16(IndexInput in, int count, IntersectVisitor visitor) throws IOException { readDelta16(in, count, scratch); - for (int i = 0; i < count; i++) { - visitor.visit(scratch[i]); - } + scratchDocIdSetIterator.reset(count); + visitor.visit(scratchDocIdSetIterator); } private static void readInts24(IndexInput in, int count, IntersectVisitor visitor) @@ -346,8 +404,7 @@ private static void readInts24(IndexInput in, int count, IntersectVisitor visito private void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException { in.readInts(scratch, 0, count); - for (int i = 0; i < count; i++) { - visitor.visit(scratch[i]); - } + scratchDocIdSetIterator.reset(count); + visitor.visit(scratchDocIdSetIterator); } } From 8287e855259c4658f20a3eb60e97bff0bb147370 Mon Sep 17 00:00:00 2001 From: anton Date: Fri, 1 Mar 2024 11:24:19 +0100 Subject: [PATCH 2/8] Spotless --- .../org/apache/lucene/util/bkd/DocIdsWriter.java | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java index d8e03f54d527..2e95812475c5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java @@ -39,13 +39,14 @@ final class DocIdsWriter { private final ScratchDocIdSetIterator scratchDocIdSetIterator = new ScratchDocIdSetIterator(); /** - DocIdSetIterator to be used to iterate over the scratch buffer. A single instance is reused to avoid - re-allocating the object. The reset method should be called before each use with the count. - - The main reason for existing is to be able to call the {@link IntersectVisitor#visit(DocIdSetIterator)} method - rather than the {@link IntersectVisitor#visit(int)} method. This seems to make a difference in performance, - probably due to fewer virtual calls then happening (once per read call rather than once per doc). - + * DocIdSetIterator to be used to iterate over the scratch buffer. A single instance is reused to + * avoid re-allocating the object. The reset method should be called before each use with the + * count. + * + *

The main reason for existing is to be able to call the {@link + * IntersectVisitor#visit(DocIdSetIterator)} method rather than the {@link + * IntersectVisitor#visit(int)} method. This seems to make a difference in performance, probably + * due to fewer virtual calls then happening (once per read call rather than once per doc). */ private class ScratchDocIdSetIterator extends DocIdSetIterator { From 60495d40b93f7d24dd96c2545f32d38bde3f2466 Mon Sep 17 00:00:00 2001 From: anton Date: Fri, 15 Mar 2024 11:20:25 +0100 Subject: [PATCH 3/8] Changed bulk iteration to use IntsRef instead. --- .../org/apache/lucene/index/PointValues.java | 12 +++ .../apache/lucene/search/PointRangeQuery.java | 16 ++++ .../apache/lucene/util/bkd/DocIdsWriter.java | 73 ++++--------------- 3 files changed, 44 insertions(+), 57 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValues.java b/lucene/core/src/java/org/apache/lucene/index/PointValues.java index 2bc2173560bd..5b300335360a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PointValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/PointValues.java @@ -31,6 +31,7 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.bkd.BKDConfig; /** @@ -298,6 +299,17 @@ default void visit(DocIdSetIterator iterator) throws IOException { } } + /** + * Similar to {@link IntersectVisitor#visit(int)}, but a bulk visit and implements may have + * their optimizations. Even if the implementation does the same thing, this can be a speed + * improvement due to fewer virtual calls. + */ + default void visit(IntsRef ref) throws IOException { + for (int i = ref.offset; i < ref.length + ref.offset; i++) { + visit(ref.ints[i]); + } + } + /** * Called for all documents in a leaf cell that crosses the query. The consumer should * scrutinize the packedValue to decide whether to accept it. In the 1D case, values are visited diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index 64fb00bdb347..3fcd0fef4d0c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -32,6 +32,7 @@ import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRef; /** * Abstract class for range queries against single or multidimensional points such as {@link @@ -185,6 +186,13 @@ public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); } + @Override + public void visit(IntsRef ref) { + for (int i = ref.offset; i < ref.offset + ref.length; i++) { + adder.add(ref.ints[i]); + } + } + @Override public void visit(int docID, byte[] packedValue) { if (matches(packedValue)) { @@ -222,6 +230,14 @@ public void visit(DocIdSetIterator iterator) throws IOException { cost[0] = Math.max(0, cost[0] - iterator.cost()); } + @Override + public void visit(IntsRef ref) throws IOException { + for (int i = ref.offset; i < ref.offset + ref.length; i++) { + result.clear(ref.ints[i]); + cost[0]--; + } + } + @Override public void visit(int docID, byte[] packedValue) { if (matches(packedValue) == false) { diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java index 2e95812475c5..9f6a10b9ddcf 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java @@ -23,6 +23,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.DocBaseBitSetIterator; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRef; final class DocIdsWriter { @@ -36,64 +37,20 @@ final class DocIdsWriter { private final int[] scratch; - private final ScratchDocIdSetIterator scratchDocIdSetIterator = new ScratchDocIdSetIterator(); - /** - * DocIdSetIterator to be used to iterate over the scratch buffer. A single instance is reused to - * avoid re-allocating the object. The reset method should be called before each use with the - * count. + * IntsRef to be used to iterate over the scratch buffer. A single instance is reused to avoid + * re-allocating the object. The ints and length fields need to be reset each use. * *

The main reason for existing is to be able to call the {@link - * IntersectVisitor#visit(DocIdSetIterator)} method rather than the {@link - * IntersectVisitor#visit(int)} method. This seems to make a difference in performance, probably - * due to fewer virtual calls then happening (once per read call rather than once per doc). + * IntersectVisitor#visit(IntsRef)} method rather than the {@link IntersectVisitor#visit(int)} + * method. This seems to make a difference in performance, probably due to fewer virtual calls + * then happening (once per read call rather than once per doc). */ - private class ScratchDocIdSetIterator extends DocIdSetIterator { - - private int index = -1; - private int count = -1; - - @Override - public int docID() { - if (index < 0) { - return -1; - } - if (index >= count) { - return NO_MORE_DOCS; - } - return scratch[index]; - } - - @Override - public int nextDoc() throws IOException { - index++; - if (index >= count) { - return NO_MORE_DOCS; - } - return scratch[index]; - } + private final IntsRef scratchIntsRef = new IntsRef(); - @Override - public int advance(int target) throws IOException { - while (index < count && scratch[index] < target) { - index++; - } - if (index >= count) { - return NO_MORE_DOCS; - } else { - return scratch[index]; - } - } - - @Override - public long cost() { - return count; - } - - void reset(int count) { - this.count = count; - this.index = -1; - } + { + // This is here to not rely on the default constructor of IntsRef to set offset to 0 + scratchIntsRef.offset = 0; } DocIdsWriter(int maxPointsInLeaf) { @@ -378,8 +335,9 @@ private static void readLegacyDeltaVInts(IndexInput in, int count, IntersectVisi private void readDelta16(IndexInput in, int count, IntersectVisitor visitor) throws IOException { readDelta16(in, count, scratch); - scratchDocIdSetIterator.reset(count); - visitor.visit(scratchDocIdSetIterator); + scratchIntsRef.ints = scratch; + scratchIntsRef.length = count; + visitor.visit(scratchIntsRef); } private static void readInts24(IndexInput in, int count, IntersectVisitor visitor) @@ -405,7 +363,8 @@ private static void readInts24(IndexInput in, int count, IntersectVisitor visito private void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException { in.readInts(scratch, 0, count); - scratchDocIdSetIterator.reset(count); - visitor.visit(scratchDocIdSetIterator); + scratchIntsRef.ints = scratch; + scratchIntsRef.length = count; + visitor.visit(scratchIntsRef); } } From e826696fd235cb1ef215745944cac6e34faa0d20 Mon Sep 17 00:00:00 2001 From: anton Date: Fri, 15 Mar 2024 11:33:30 +0100 Subject: [PATCH 4/8] Clearer comment. --- lucene/core/src/java/org/apache/lucene/index/PointValues.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValues.java b/lucene/core/src/java/org/apache/lucene/index/PointValues.java index 5b300335360a..6e6bb0bb3f9f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PointValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/PointValues.java @@ -301,8 +301,8 @@ default void visit(DocIdSetIterator iterator) throws IOException { /** * Similar to {@link IntersectVisitor#visit(int)}, but a bulk visit and implements may have - * their optimizations. Even if the implementation does the same thing, this can be a speed - * improvement due to fewer virtual calls. + * their optimizations. Even if the implementation does the same thing this method, this may be + * a speed improvement due to fewer virtual calls. */ default void visit(IntsRef ref) throws IOException { for (int i = ref.offset; i < ref.length + ref.offset; i++) { From 2b6328b7dd2be5cb5f7bd3d768f3dbf446778123 Mon Sep 17 00:00:00 2001 From: anton Date: Tue, 26 Mar 2024 23:17:37 +0100 Subject: [PATCH 5/8] Decrease cost outside loop --- .../src/java/org/apache/lucene/search/PointRangeQuery.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index 3fcd0fef4d0c..ad5f7bd9a049 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -231,11 +231,11 @@ public void visit(DocIdSetIterator iterator) throws IOException { } @Override - public void visit(IntsRef ref) throws IOException { + public void visit(IntsRef ref) { for (int i = ref.offset; i < ref.offset + ref.length; i++) { result.clear(ref.ints[i]); - cost[0]--; } + cost[0] -= ref.length; } @Override From cc39cc1afb32a49b83a9557a31835d93ce152ef8 Mon Sep 17 00:00:00 2001 From: anton Date: Sat, 30 Mar 2024 23:34:15 +0100 Subject: [PATCH 6/8] Added test to make inverse intsref be used. --- .../org/apache/lucene/search/TestPointQueries.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java index 93646bfafa78..0ca2d177b09f 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java @@ -374,6 +374,10 @@ public void testRandomLongsMedium() throws Exception { doTestRandomLongs(1000); } + public void testRandomLongsBig() throws Exception { + doTestRandomLongs(20_000); + } + private void doTestRandomLongs(int count) throws Exception { int numValues = TestUtil.nextInt(random(), count, count * 2); @@ -434,7 +438,13 @@ private static void verifyLongs(long[] values, int[] ids) throws Exception { dir = newMaybeVirusCheckingDirectory(); } - int missingPct = random().nextInt(100); + /* + The point range query chooses only considers using an inverse BKD visitor if + there is exactly one value per document. If any document misses a value that + code is not exercised. Using a nextBoolean() here increases the likelihood + that there is no missing values, making the test more likely to test that code. + */ + int missingPct = random().nextBoolean() ? 0 : random().nextInt(100); int deletedPct = random().nextInt(100); if (VERBOSE) { System.out.println(" missingPct=" + missingPct); From 95e3bcc03ceba677bac1547218a109d0253d7a44 Mon Sep 17 00:00:00 2001 From: anton Date: Mon, 1 Apr 2024 18:19:34 +0200 Subject: [PATCH 7/8] Wording improvement in javadoc. --- lucene/core/src/java/org/apache/lucene/index/PointValues.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValues.java b/lucene/core/src/java/org/apache/lucene/index/PointValues.java index 6e6bb0bb3f9f..14809d39bf4d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PointValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/PointValues.java @@ -289,7 +289,7 @@ public interface IntersectVisitor { void visit(int docID) throws IOException; /** - * Similar to {@link IntersectVisitor#visit(int)}, but a bulk visit and implements may have + * Similar to {@link IntersectVisitor#visit(int)}, but a bulk visit and implementations may have * their optimizations. */ default void visit(DocIdSetIterator iterator) throws IOException { From 288ab03640573b131686521c1c59942461b09807 Mon Sep 17 00:00:00 2001 From: anton Date: Mon, 1 Apr 2024 20:41:15 +0200 Subject: [PATCH 8/8] Added CHANGES.txt entry. --- lucene/CHANGES.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c11c77528f3e..e739dbcbe3f2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -134,6 +134,9 @@ Optimizations * GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X) +* GITHUB#13149: Made PointRangeQuery faster, for some segment sizes, by reducing the amount of virtual calls to + IntersectVisitor::visit(int). (Anton Hägerstrand) + Bug Fixes ---------------------