Skip to content

Commit

Permalink
Made DocIdsWriter use DISI when reading documents with an IntersectVi…
Browse files Browse the repository at this point in the history
…sitor (#13149)

* Made DocIdsWriter use DISI when reading documents with an IntersectVisitor.

Instead of calling IntersectVisitor.visit for each doc in the
readDelta16 and readInts32 methods, create a DocIdSetIterator
and call IntersectVisitor.visit(DocIdSetIterator) instead.

This seems to make Lucene faster at sorting and range querying
tasks - the hypothesis being that it is due to fewer virtual calls.

* Spotless

* Changed bulk iteration to use IntsRef instead.

* Clearer comment.

* Decrease cost outside loop

* Added test to make inverse intsref be used.

* Wording improvement in javadoc.

* Added CHANGES.txt entry.
  • Loading branch information
antonha authored and mikemccand committed Apr 2, 2024
1 parent e3bad09 commit a633334
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 8 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ Optimizations
* GITHUB#11888: Binary search the BlockTree terms dictionary entries when all suffixes have the same length
in a leaf block, speeding up cases like primary key lookup on an id field when all ids are the same length. (zhouhui)

* GITHUB#13149: Made PointRangeQuery faster, for some segment sizes, by reducing the amount of virtual calls to
IntersectVisitor::visit(int). (Anton Hägerstrand)

Bug Fixes
---------------------

Expand Down
14 changes: 13 additions & 1 deletion lucene/core/src/java/org/apache/lucene/index/PointValues.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.bkd.BKDConfig;

/**
Expand Down Expand Up @@ -288,7 +289,7 @@ public interface IntersectVisitor {
void visit(int docID) throws IOException;

/**
* Similar to {@link IntersectVisitor#visit(int)}, but a bulk visit and implements may have
* Similar to {@link IntersectVisitor#visit(int)}, but a bulk visit and implementations may have
* their optimizations.
*/
default void visit(DocIdSetIterator iterator) throws IOException {
Expand All @@ -298,6 +299,17 @@ default void visit(DocIdSetIterator iterator) throws IOException {
}
}

/**
* Similar to {@link IntersectVisitor#visit(int)}, but a bulk visit and implements may have
* their optimizations. Even if the implementation does the same thing this method, this may be
* a speed improvement due to fewer virtual calls.
*/
default void visit(IntsRef ref) throws IOException {
for (int i = ref.offset; i < ref.length + ref.offset; i++) {
visit(ref.ints[i]);
}
}

/**
* Called for all documents in a leaf cell that crosses the query. The consumer should
* scrutinize the packedValue to decide whether to accept it. In the 1D case, values are visited
Expand Down
16 changes: 16 additions & 0 deletions lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;

/**
* Abstract class for range queries against single or multidimensional points such as {@link
Expand Down Expand Up @@ -185,6 +186,13 @@ public void visit(DocIdSetIterator iterator) throws IOException {
adder.add(iterator);
}

@Override
public void visit(IntsRef ref) {
for (int i = ref.offset; i < ref.offset + ref.length; i++) {
adder.add(ref.ints[i]);
}
}

@Override
public void visit(int docID, byte[] packedValue) {
if (matches(packedValue)) {
Expand Down Expand Up @@ -222,6 +230,14 @@ public void visit(DocIdSetIterator iterator) throws IOException {
cost[0] = Math.max(0, cost[0] - iterator.cost());
}

@Override
public void visit(IntsRef ref) {
for (int i = ref.offset; i < ref.offset + ref.length; i++) {
result.clear(ref.ints[i]);
}
cost[0] -= ref.length;
}

@Override
public void visit(int docID, byte[] packedValue) {
if (matches(packedValue) == false) {
Expand Down
29 changes: 23 additions & 6 deletions lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.DocBaseBitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;

final class DocIdsWriter {

Expand All @@ -36,6 +37,22 @@ final class DocIdsWriter {

private final int[] scratch;

/**
* IntsRef to be used to iterate over the scratch buffer. A single instance is reused to avoid
* re-allocating the object. The ints and length fields need to be reset each use.
*
* <p>The main reason for existing is to be able to call the {@link
* IntersectVisitor#visit(IntsRef)} method rather than the {@link IntersectVisitor#visit(int)}
* method. This seems to make a difference in performance, probably due to fewer virtual calls
* then happening (once per read call rather than once per doc).
*/
private final IntsRef scratchIntsRef = new IntsRef();

{
// This is here to not rely on the default constructor of IntsRef to set offset to 0
scratchIntsRef.offset = 0;
}

DocIdsWriter(int maxPointsInLeaf) {
scratch = new int[maxPointsInLeaf];
}
Expand Down Expand Up @@ -318,9 +335,9 @@ private static void readLegacyDeltaVInts(IndexInput in, int count, IntersectVisi

private void readDelta16(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
readDelta16(in, count, scratch);
for (int i = 0; i < count; i++) {
visitor.visit(scratch[i]);
}
scratchIntsRef.ints = scratch;
scratchIntsRef.length = count;
visitor.visit(scratchIntsRef);
}

private static void readInts24(IndexInput in, int count, IntersectVisitor visitor)
Expand All @@ -346,8 +363,8 @@ private static void readInts24(IndexInput in, int count, IntersectVisitor visito

private void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
in.readInts(scratch, 0, count);
for (int i = 0; i < count; i++) {
visitor.visit(scratch[i]);
}
scratchIntsRef.ints = scratch;
scratchIntsRef.length = count;
visitor.visit(scratchIntsRef);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,10 @@ public void testRandomLongsMedium() throws Exception {
doTestRandomLongs(1000);
}

public void testRandomLongsBig() throws Exception {
doTestRandomLongs(20_000);
}

private void doTestRandomLongs(int count) throws Exception {

int numValues = TestUtil.nextInt(random(), count, count * 2);
Expand Down Expand Up @@ -434,7 +438,13 @@ private static void verifyLongs(long[] values, int[] ids) throws Exception {
dir = newMaybeVirusCheckingDirectory();
}

int missingPct = random().nextInt(100);
/*
The point range query chooses only considers using an inverse BKD visitor if
there is exactly one value per document. If any document misses a value that
code is not exercised. Using a nextBoolean() here increases the likelihood
that there is no missing values, making the test more likely to test that code.
*/
int missingPct = random().nextBoolean() ? 0 : random().nextInt(100);
int deletedPct = random().nextInt(100);
if (VERBOSE) {
System.out.println(" missingPct=" + missingPct);
Expand Down

0 comments on commit a633334

Please sign in to comment.