Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better paging when random reads go backwards #12357

Merged
merged 8 commits into from
Jun 9, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,9 @@ Optimizations

* GITHUB#12328: Optimize ConjunctionDISI.createConjunction (Armin Braun)

* GITHUB#12357: Better paging when doing backwards random reads. This speeds up
queries relying on terms in NIOFSDirectory. (Alan Woodward)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
queries relying on terms in NIOFSDirectory. (Alan Woodward)
queries relying on terms in NIOFSDirectory and SimpleFSDirectory. (Alan Woodward)


* GITHUB#12339: Optimize part of duplicate calculation numDeletesToMerge in merge phase (fudongying)

Bug Fixes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public abstract class BufferedIndexInput extends IndexInput implements RandomAcc
/** A buffer size for merges set to {@value #MERGE_BUFFER_SIZE}. */
public static final int MERGE_BUFFER_SIZE = 4096;

private int bufferSize = BUFFER_SIZE;
private final int bufferSize;
romseygeek marked this conversation as resolved.
Show resolved Hide resolved

private ByteBuffer buffer = EMPTY_BYTEBUFFER;

Expand Down Expand Up @@ -72,7 +72,7 @@ public BufferedIndexInput(String resourceDesc, int bufferSize) {
this.bufferSize = bufferSize;
}

/** Returns buffer size. @see #setBufferSize */
/** Returns buffer size */
public final int getBufferSize() {
return bufferSize;
}
Expand Down Expand Up @@ -159,55 +159,53 @@ public final long readLong() throws IOException {
}
}

@Override
public final byte readByte(long pos) throws IOException {
// Computes an offset into the current buffer from an absolute position to read
// `width` bytes from. If the buffer does not contain the position, then we
// readjust the bufferStart and refill.
private long resolvePositionInBuffer(long pos, int width) throws IOException {
long index = pos - bufferStart;
if (index < 0 || index >= buffer.limit()) {
if (index >= 0 && index < buffer.limit() - width + 1) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: let's save an addition?

Suggested change
if (index >= 0 && index < buffer.limit() - width + 1) {
if (index >= 0 && index <= buffer.limit() - width) {

return index;
}
if (index < 0) {
// if we're moving backwards, then try and fill up the previous page rather than
// starting again at the current pos, to avoid successive backwards reads reloading
// the same data over and over again
bufferStart = Math.min(pos, Math.max(0, bufferStart - bufferSize));
if (bufferStart + bufferSize - pos < width) {
bufferStart += width - 1; // make sure we don't read over the end of the buffer
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: should we update bufferStart in that case so that pos + width ends right at the end of the buffer instead of adding width-1 to bufferStart regardless of the number of bytes that we were missing in the buffer?

Suggested change
bufferStart += width - 1; // make sure we don't read over the end of the buffer
bufferStart = pos + width - bufferSize; // make sure we don't read over the end of the buffer

Maybe this could be folded directly into the min/max computations? E.g. something like (not tested)

bufferStart = bufferStart - bufferSize;
bufferStart = Math.max(bufferStart, pos + width - bufferSize); // make sure we don't read over the end of the buffer
bufferStart = Math.max(bufferStart, 0);
bufferStart = Math.min(bufferStart, pos);

}
} else {
// we're moving forwards, reset the buffer to start at pos
bufferStart = pos;
buffer.limit(0); // trigger refill() on read
seekInternal(pos);
refill();
index = 0;
}
buffer.limit(0); // trigger refill() on read
seekInternal(bufferStart);
refill();
return pos - bufferStart;
}

@Override
public final byte readByte(long pos) throws IOException {
long index = resolvePositionInBuffer(pos, Byte.BYTES);
return buffer.get((int) index);
}

@Override
public final short readShort(long pos) throws IOException {
long index = pos - bufferStart;
if (index < 0 || index >= buffer.limit() - 1) {
bufferStart = pos;
buffer.limit(0); // trigger refill() on read
seekInternal(pos);
refill();
index = 0;
}
long index = resolvePositionInBuffer(pos, Short.BYTES);
return buffer.getShort((int) index);
}

@Override
public final int readInt(long pos) throws IOException {
long index = pos - bufferStart;
if (index < 0 || index >= buffer.limit() - 3) {
bufferStart = pos;
buffer.limit(0); // trigger refill() on read
seekInternal(pos);
refill();
index = 0;
}
long index = resolvePositionInBuffer(pos, Integer.BYTES);
return buffer.getInt((int) index);
}

@Override
public final long readLong(long pos) throws IOException {
long index = pos - bufferStart;
if (index < 0 || index >= buffer.limit() - 7) {
bufferStart = pos;
buffer.limit(0); // trigger refill() on read
seekInternal(pos);
refill();
index = 0;
}
long index = resolvePositionInBuffer(pos, Long.BYTES);
return buffer.getLong((int) index);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Random;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.ArrayUtil;
Expand Down Expand Up @@ -142,6 +143,72 @@ public void testEOF() throws Exception {
});
}

// Test that when reading backwards, we page backwards rather than refilling
// on every call
public void testBackwardsByteReads() throws IOException {
MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8);
for (int i = 2048; i > 0; i -= random().nextInt(16)) {
assertEquals(byten(i), input.readByte(i));
}
assertEquals(3, input.readCount);
}

public void testBackwardsShortReads() throws IOException {
MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8);
ByteBuffer bb = ByteBuffer.allocate(2);
bb.order(ByteOrder.LITTLE_ENDIAN);
for (int i = 2048; i > 0; i -= (random().nextInt(16) + 1)) {
bb.clear();
bb.put(byten(i));
bb.put(byten(i + 1));
assertEquals(bb.getShort(0), input.readShort(i));
}
// readCount can be three or four, depending on whether or not we had to adjust the bufferStart
// to include a whole short
assertTrue(
"Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3);
}

public void testBackwardsIntReads() throws IOException {
MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8);
ByteBuffer bb = ByteBuffer.allocate(4);
bb.order(ByteOrder.LITTLE_ENDIAN);
for (int i = 2048; i > 0; i -= (random().nextInt(16) + 3)) {
bb.clear();
bb.put(byten(i));
bb.put(byten(i + 1));
bb.put(byten(i + 2));
bb.put(byten(i + 3));
assertEquals(bb.getInt(0), input.readInt(i));
}
// readCount can be three or four, depending on whether or not we had to adjust the bufferStart
// to include a whole int
assertTrue(
"Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3);
}

public void testBackwardsLongReads() throws IOException {
MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8);
ByteBuffer bb = ByteBuffer.allocate(8);
bb.order(ByteOrder.LITTLE_ENDIAN);
for (int i = 2048; i > 0; i -= (random().nextInt(16) + 7)) {
bb.clear();
bb.put(byten(i));
bb.put(byten(i + 1));
bb.put(byten(i + 2));
bb.put(byten(i + 3));
bb.put(byten(i + 4));
bb.put(byten(i + 5));
bb.put(byten(i + 6));
bb.put(byten(i + 7));
assertEquals(bb.getLong(0), input.readLong(i));
}
// readCount can be three or four, depending on whether or not we had to adjust the bufferStart
// to include a whole long
assertTrue(
"Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3);
}

// byten emulates a file - byten(n) returns the n'th byte in that file.
// MyBufferedIndexInput reads this "file".
private static byte byten(long n) {
Expand All @@ -150,7 +217,8 @@ private static byte byten(long n) {

private static class MyBufferedIndexInput extends BufferedIndexInput {
private long pos;
private long len;
private final long len;
private long readCount = 0;

public MyBufferedIndexInput(long len) {
super("MyBufferedIndexInput(len=" + len + ")", BufferedIndexInput.BUFFER_SIZE);
Expand All @@ -164,14 +232,15 @@ public MyBufferedIndexInput() {
}

@Override
protected void readInternal(ByteBuffer b) throws IOException {
protected void readInternal(ByteBuffer b) {
readCount++;
while (b.hasRemaining()) {
b.put(byten(pos++));
}
}

@Override
protected void seekInternal(long pos) throws IOException {
protected void seekInternal(long pos) {
this.pos = pos;
}

Expand Down