Encode dense blocks of postings as bit sets. (#14133)
Bit sets can be faster at advancing and more storage-efficient on dense blocks
of postings. This is not a new idea: @mkhludnev proposed something similar a
long time ago in #6116.

@msokolov recently brought up (#14080) that such an encoding has become
especially appealing with the introduction of the
`DocIdSetIterator#loadIntoBitSet` API, and the fact that non-scoring
disjunctions and dense conjunctions now take advantage of it. Indeed, if
postings are stored in a bit set, `#loadIntoBitSet` would just need to OR the
postings bits into the bits that are used as an intermediate representation of
matches of the query.
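To make this concrete, here is a minimal sketch (not code from this commit) of a caller that ORs several non-scoring clauses into one shared window of matches through DocIdSetIterator#intoBitSet, the method whose bit-set-backed implementation appears in the reader changes below; WindowLoader and its parameter names are hypothetical.

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

class WindowLoader {
  // OR the matches of several clauses into one window, where bit i of
  // `window` stands for doc `windowBase + i`. When a clause's postings block
  // is itself stored as a bit set, intoBitSet can become a bulk OR of stored
  // words instead of a per-document loop.
  static void loadWindow(DocIdSetIterator[] clauses, int windowBase, FixedBitSet window)
      throws IOException {
    int upTo = windowBase + window.length();
    for (DocIdSetIterator it : clauses) {
      if (it.docID() < windowBase) {
        it.advance(windowBase);
      }
      if (it.docID() < upTo) {
        it.intoBitSet(upTo, window, windowBase);
      }
    }
  }
}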
jpountz committed Jan 14, 2025
1 parent 564a997 commit 0b2bf83
Showing 8 changed files with 315 additions and 132 deletions.
lucene/core/src/generated/checksums/generateForDeltaUtil.json (2 additions & 2 deletions)
@@ -1,4 +1,4 @@
{
-  "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "e0bf6071bcdefaa297e0bb92f79615201777652d",
-  "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "d7484ab18da33e5cb73faaf84b4e2bb832b62f9d"
+  "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "87e4d19b5284fa39adf2c24328cae2076b6f7bb3",
+  "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "165586f801bef4d2f540521e81bc119880038b6c"
}
lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java
@@ -37,23 +37,6 @@ public final class ForDeltaUtil {
  private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2;
  private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4;

-  // IDENTITY_PLUS_ONE[i] == i+1
-  private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE];
-
-  static {
-    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
-      IDENTITY_PLUS_ONE[i] = i + 1;
-    }
-  }
-
-  private static void prefixSumOfOnes(int[] arr, int base) {
-    System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE);
-    // This loop gets auto-vectorized
-    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
-      arr[i] += base;
-    }
-  }
-
  private static void prefixSum8(int[] arr, int base) {
    // When the number of bits per value is 4 or less, we can sum up all values in a block without
    // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4
@@ -199,43 +182,35 @@ private static void innerPrefixSum16(int[] arr) {
  private final int[] tmp = new int[BLOCK_SIZE];

  /**
-   * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
-   * ints} are expected to be deltas between consecutive values.
+   * Return the number of bits per value required to store the given array containing strictly
+   * positive numbers.
   */
-  void encodeDeltas(int[] ints, DataOutput out) throws IOException {
-    if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings
-      out.writeByte((byte) 0);
-    } else {
-      int or = 0;
-      for (int l : ints) {
-        or |= l;
-      }
-      assert or != 0;
-      final int bitsPerValue = PackedInts.bitsRequired(or);
-      out.writeByte((byte) bitsPerValue);
-
-      final int primitiveSize;
-      if (bitsPerValue <= 3) {
-        primitiveSize = 8;
-        collapse8(ints);
-      } else if (bitsPerValue <= 10) {
-        primitiveSize = 16;
-        collapse16(ints);
-      } else {
-        primitiveSize = 32;
-      }
-      encode(ints, bitsPerValue, primitiveSize, out, tmp);
-    }
-  }
-
-  /** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */
-  void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException {
-    final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte());
-    if (bitsPerValue == 0) {
-      prefixSumOfOnes(ints, base);
-    } else {
-      decodeAndPrefixSum(bitsPerValue, pdu, base, ints);
-    }
-  }
+  int bitsRequired(int[] ints) {
+    int or = 0;
+    for (int l : ints) {
+      or |= l;
+    }
+    // Deltas should be strictly positive since the delta between consecutive doc IDs is at least 1
+    assert or != 0;
+    return PackedInts.bitsRequired(or);
+  }
+
+  /**
+   * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code
+   * ints} are expected to be deltas between consecutive values.
+   */
+  void encodeDeltas(int bitsPerValue, int[] ints, DataOutput out) throws IOException {
+    final int primitiveSize;
+    if (bitsPerValue <= 3) {
+      primitiveSize = 8;
+      collapse8(ints);
+    } else if (bitsPerValue <= 10) {
+      primitiveSize = 16;
+      collapse16(ints);
+    } else {
+      primitiveSize = 32;
+    }
+    encode(ints, bitsPerValue, primitiveSize, out, tmp);
+  }

  /** Delta-decode 128 integers into {@code ints}. */
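Since encodeDeltas no longer writes the leading byte itself, writing a block becomes a two-step decision for the caller. Below is a hedged sketch of how a writer could combine the split bitsRequired/encodeDeltas API; the actual decision lives in Lucene101PostingsWriter, which is not part of this excerpt, the tie-break is an assumption, and the code presumes same-package access since ForDeltaUtil is package-private.

import java.io.IOException;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.FixedBitSet;

class BlockEncodingSketch {
  static final int BLOCK_SIZE = 128;

  // deltas[0..127] are the gaps between consecutive doc IDs of one block.
  static void writeDocBlock(ForDeltaUtil forDeltaUtil, int[] deltas,
      int prevBlockLastDoc, int lastDocInBlock, DataOutput out) throws IOException {
    int bitsPerValue = forDeltaUtil.bitsRequired(deltas);
    int forBits = bitsPerValue * BLOCK_SIZE; // storage cost of packed deltas
    int rangeBits = lastDocInBlock - prevBlockLastDoc; // one bit per doc in the range
    if (rangeBits <= forBits) {
      // Bit set at least as compact: a negative byte records the word count.
      // (The special value 0, for 128 consecutive docs, is omitted here.)
      out.writeByte((byte) -FixedBitSet.bits2words(rangeBits));
      // ... then write the words of the bit set ...
    } else {
      out.writeByte((byte) bitsPerValue);
      forDeltaUtil.encodeDeltas(bitsPerValue, deltas, out);
    }
  }
}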
@@ -307,6 +282,9 @@ void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int
        prefixSum32(ints, base);
        break;
      default:
+        if (bitsPerValue < 1 || bitsPerValue > Integer.SIZE) {
+          throw new IllegalStateException("Illegal number of bits per value: " + bitsPerValue);
+        }
        decodeSlow(bitsPerValue, pdu, tmp, ints);
        prefixSum32(ints, base);
        break;
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java
@@ -358,8 +358,17 @@ public final class Lucene101PostingsFormat extends PostingsFormat {
static final String PAY_CODEC = "Lucene101PostingsWriterPay";

static final int VERSION_START = 0;
-  static final int VERSION_CURRENT = VERSION_START;
+
+  /**
+   * Version that started encoding dense blocks as bit sets. Note: the old format is a subset of the
+   * new format, so Lucene101PostingsReader is able to read the old format without checking the
+   * version.
+   */
+  static final int VERSION_DENSE_BLOCKS_AS_BITSETS = 1;
+
+  static final int VERSION_CURRENT = VERSION_DENSE_BLOCKS_AS_BITSETS;

+  private final int version;
private final int minTermBlockSize;
private final int maxTermBlockSize;

@@ -378,15 +387,24 @@ public Lucene101PostingsFormat() {
* Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
*/
  public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
+    this(minTermBlockSize, maxTermBlockSize, VERSION_CURRENT);
+  }
+
+  /** Expert constructor that allows setting the version. */
+  public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize, int version) {
    super("Lucene101");
+    if (version < VERSION_START || version > VERSION_CURRENT) {
+      throw new IllegalArgumentException("Version out of range: " + version);
+    }
+    this.version = version;
    Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize);
    this.minTermBlockSize = minTermBlockSize;
    this.maxTermBlockSize = maxTermBlockSize;
  }

@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
+    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state, version);
boolean success = false;
try {
FieldsConsumer ret =
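A possible use of the expert constructor, e.g. to keep writing the pre-bit-set format for compatibility testing; since the version constants are package-private, such code would have to live in org.apache.lucene.codecs.lucene101.

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;

class PinnedFormatSketch {
  // Pin VERSION_START (0) so that only packed-delta blocks are ever written.
  static PostingsFormat oldFormat() {
    return new Lucene101PostingsFormat(
        Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE,
        Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE,
        Lucene101PostingsFormat.VERSION_START);
  }
}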
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java
@@ -295,12 +295,37 @@ private static int sumOverRange(int[] arr, int start, int end) {

final class BlockPostingsEnum extends ImpactsEnum {

+  private enum DeltaEncoding {
+    /**
+     * Deltas between consecutive docs are stored as packed integers, ie. the block is encoded
+     * using Frame Of Reference (FOR).
+     */
+    PACKED,
+    /**
+     * Deltas between consecutive docs are stored using unary coding, ie. {@code delta-1} zero
+     * bits followed by a one bit, ie. the block is encoded as an offset plus a bit set.
+     */
+    UNARY
+  }
+
  private ForDeltaUtil forDeltaUtil;
  private PForUtil pforUtil;

-  /* Variables that store the content of a block and the current position within this block */
+  /* Shared variables */
+  private DeltaEncoding encoding;
+  private int doc; // doc we last read
+
+  /* Variables when the block is stored as packed deltas (Frame Of Reference) */
  private final int[] docBuffer = new int[BLOCK_SIZE];

-  private int doc; // doc we last read
+  /* Variables when the block is stored as a bit set */
+  // Since we use a bit set when it's more storage-efficient, the bit set cannot have more than
+  // BLOCK_SIZE*32 bits, which is the maximum possible storage requirement with FOR.
+  private final FixedBitSet docBitSet = new FixedBitSet(BLOCK_SIZE * Integer.SIZE);
+  private int docBitSetBase;
+  // Reuse docBuffer for cumulative pop counts of the words of the bit set.
+  private final int[] docCumulativeWordPopCounts = docBuffer;

  // level 0 skip data
  private int level0LastDocID;
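The unary encoding is easiest to see on a toy example: docs 5, 6, 8 following prevDocID = 4 have deltas 1, 1, 2, i.e. the unary string 1 1 01, which is exactly bits 0, 1 and 3 of a bit set based at prevDocID + 1. The sizing comment above is just FOR's worst case: 128 deltas of at most 32 bits each is 128 * 32 = 4096 bits, so a block is only stored in unary when it spans at most 4096 doc IDs, and a bit set of BLOCK_SIZE * Integer.SIZE = 4096 bits (64 longs) always suffices. A standalone sketch, not Lucene code:

import org.apache.lucene.util.FixedBitSet;

class UnaryBlockSketch {
  static void demo() {
    FixedBitSet block = new FixedBitSet(64);
    int base = 4 + 1; // docBitSetBase = prevDocID + 1
    for (int doc : new int[] {5, 6, 8}) {
      block.set(doc - base); // sets bits 0, 1 and 3
    }
    // Advancing to target 8 is a single nextSetBit call:
    int next = block.nextSetBit(8 - base);
    assert base + next == 8;
  }
}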
@@ -572,7 +597,39 @@ public int freq() throws IOException {
}

private void refillFullBlock() throws IOException {
-      forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer);
+      int bitsPerValue = docIn.readByte();
+      if (bitsPerValue > 0) {
+        // block is encoded as 128 packed integers that record the delta between doc IDs
+        forDeltaUtil.decodeAndPrefixSum(bitsPerValue, docInUtil, prevDocID, docBuffer);
+        encoding = DeltaEncoding.PACKED;
+      } else {
+        // block is encoded as a bit set
+        assert level0LastDocID != NO_MORE_DOCS;
+        docBitSetBase = prevDocID + 1;
+        int numLongs;
+        if (bitsPerValue == 0) {
+          // 0 is used to record that all 128 docs in the block are consecutive
+          numLongs = BLOCK_SIZE / Long.SIZE; // 2
+          docBitSet.set(0, BLOCK_SIZE);
+        } else {
+          numLongs = -bitsPerValue;
+          docIn.readLongs(docBitSet.getBits(), 0, numLongs);
+        }
+        // Note: we know that BLOCK_SIZE bits are set, so no need to compute the cumulative pop
+        // count at the last index, it will be BLOCK_SIZE.
+        // Note: this for loop auto-vectorizes
+        for (int i = 0; i < numLongs - 1; ++i) {
+          docCumulativeWordPopCounts[i] = Long.bitCount(docBitSet.getBits()[i]);
+        }
+        for (int i = 1; i < numLongs - 1; ++i) {
+          docCumulativeWordPopCounts[i] += docCumulativeWordPopCounts[i - 1];
+        }
+        docCumulativeWordPopCounts[numLongs - 1] = BLOCK_SIZE;
+        assert docCumulativeWordPopCounts[numLongs - 2]
+                + Long.bitCount(docBitSet.getBits()[numLongs - 1])
+            == BLOCK_SIZE;
+        encoding = DeltaEncoding.UNARY;
+      }
if (indexHasFreq) {
if (needsFreq) {
freqFP = docIn.getFilePointer();
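In other words, the leading byte of a doc block is now a three-way tag. A sketch restating the convention the branches above implement; `token` and the method name are hypothetical.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

class BlockHeaderSketch {
  static void describe(IndexInput docIn) throws IOException {
    byte token = docIn.readByte();
    if (token > 0) {
      // FOR block: 128 packed deltas, `token` bits per value.
    } else if (token == 0) {
      // 128 consecutive doc IDs: all 128 bits set, no payload to read.
    } else {
      // Unary block: read -token longs of bit-set words from docIn.
    }
  }
}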
@@ -607,6 +664,7 @@ private void refillRemainder() throws IOException {
prevDocID = docBuffer[BLOCK_SIZE - 1];
docBufferUpto = 0;
posDocBufferUpto = 0;
+      encoding = DeltaEncoding.PACKED;
assert docBuffer[docBufferSize] == NO_MORE_DOCS;
}

@@ -727,9 +785,10 @@ private void moveToNextLevel0Block() throws IOException {
if (needsDocsAndFreqsOnly && docCountLeft >= BLOCK_SIZE) {
// Optimize the common path for exhaustive evaluation
long level0NumBytes = docIn.readVLong();
-      docIn.skipBytes(level0NumBytes);
+      long level0End = docIn.getFilePointer() + level0NumBytes;
+      level0LastDocID += readVInt15(docIn);
+      docIn.seek(level0End);
      refillFullBlock();
-      level0LastDocID = docBuffer[BLOCK_SIZE - 1];
} else {
doMoveToNextLevel0Block();
}
@@ -857,7 +916,19 @@ public int nextDoc() throws IOException {
moveToNextLevel0Block();
}

-    return this.doc = docBuffer[docBufferUpto++];
+    switch (encoding) {
+      case PACKED:
+        doc = docBuffer[docBufferUpto];
+        break;
+      case UNARY:
+        int next = docBitSet.nextSetBit(doc - docBitSetBase + 1);
+        assert next != NO_MORE_DOCS;
+        doc = docBitSetBase + next;
+        break;
+    }
+
+    ++docBufferUpto;
+    return this.doc;
}
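For unary blocks, sequential iteration thus reduces to walking set bits. A toy loop mirroring the UNARY case above, not Lucene code:

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

class UnaryIterationSketch {
  static void forEachDoc(FixedBitSet block, int base) {
    int bit = block.nextSetBit(0);
    while (bit != DocIdSetIterator.NO_MORE_DOCS) {
      System.out.println(base + bit); // the decoded doc ID
      bit = bit + 1 < block.length() ? block.nextSetBit(bit + 1) : DocIdSetIterator.NO_MORE_DOCS;
    }
  }
}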

@Override
@@ -870,9 +941,30 @@ public int advance(int target) throws IOException {
needsRefilling = false;
}

-      int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
-      this.doc = docBuffer[next];
-      docBufferUpto = next + 1;
+      switch (encoding) {
+        case PACKED:
+          {
+            int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize);
+            this.doc = docBuffer[next];
+            docBufferUpto = next + 1;
+          }
+          break;
+        case UNARY:
+          {
+            int next = docBitSet.nextSetBit(target - docBitSetBase);
+            assert next != NO_MORE_DOCS;
+            this.doc = docBitSetBase + next;
+            int wordIndex = next >> 6;
+            // Take the cumulative pop count for the given word, and subtract bits on the left of
+            // the current doc.
+            docBufferUpto =
+                1
+                    + docCumulativeWordPopCounts[wordIndex]
+                    - Long.bitCount(docBitSet.getBits()[wordIndex] >>> next);
+          }
+          break;
+      }
+

return doc;
}
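The docBufferUpto arithmetic in the UNARY branch is worth unpacking: docCumulativeWordPopCounts[wordIndex] counts every set bit up to the end of the matched word, and Long.bitCount(word >>> next) counts the set bits at positions >= next within that word (Java long shifts only use the low 6 bits of the shift count), so their difference is the number of docs strictly before the match. A toy version with made-up bits:

import org.apache.lucene.util.FixedBitSet;

class PopCountSketch {
  static void demo() {
    long[] words = new long[2];
    FixedBitSet bits = new FixedBitSet(words, 128);
    bits.set(3);
    bits.set(64);
    bits.set(70); // three docs in this block
    int[] cumulative = new int[2];
    cumulative[0] = Long.bitCount(words[0]);
    cumulative[1] = cumulative[0] + Long.bitCount(words[1]);
    int next = bits.nextSetBit(65); // 70
    int wordIndex = next >> 6; // 1
    int atOrAfterNext = Long.bitCount(words[wordIndex] >>> next);
    int upto = 1 + cumulative[wordIndex] - atOrAfterNext;
    assert upto == 3; // bit 70 holds the third doc of the block
  }
}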

@@ -891,19 +983,53 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept
moveToNextLevel0Block();
}

-      int start = docBufferUpto;
-      int end = computeBufferEndBoundary(upTo);
-      if (end != 0) {
-        bufferIntoBitSet(start, end, bitSet, offset);
-        doc = docBuffer[end - 1];
-      }
-      docBufferUpto = end;
-
+      switch (encoding) {
+        case PACKED:
+          {
+            int start = docBufferUpto;
+            int end = computeBufferEndBoundary(upTo);
+            if (end != 0) {
+              bufferIntoBitSet(start, end, bitSet, offset);
+              doc = docBuffer[end - 1];
+            }
+            docBufferUpto = end;
+            if (end != BLOCK_SIZE) {
+              // Either the block is a tail block, or the block did not fully match, we're done.
+              nextDoc();
+              assert doc >= upTo;
+              return;
+            }
+          }
+          break;
+        case UNARY:
+          {
+            int sourceFrom;
+            if (docBufferUpto == 0) {
+              // start from beginning
+              sourceFrom = 0;
+            } else {
+              // start after the current doc
+              sourceFrom = doc - docBitSetBase + 1;
+            }
+
-      if (end != BLOCK_SIZE) {
-        // Either the block is a tail block, or the block did not fully match, we're done.
-        nextDoc();
-        assert doc >= upTo;
-        break;
+            int destFrom = docBitSetBase - offset + sourceFrom;
+
+            assert level0LastDocID != NO_MORE_DOCS;
+            int sourceTo = Math.min(upTo, level0LastDocID + 1) - docBitSetBase;
+
+            if (sourceTo > sourceFrom) {
+              FixedBitSet.orRange(docBitSet, sourceFrom, bitSet, destFrom, sourceTo - sourceFrom);
+            }
+            if (docBitSetBase + sourceTo <= level0LastDocID) {
+              // We stopped before the end of the current bit set, which means that we're done.
+              // Set the current doc before returning.
+              advance(docBitSetBase + sourceTo);
+              return;
+            }
+            doc = level0LastDocID;
+            docBufferUpto = BLOCK_SIZE;
+          }
+          break;
      }
    }
  }
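The bulk OR at the heart of the UNARY branch is easy to check in isolation: a sub-range of the stored block bit set is shifted into the caller's coordinates by FixedBitSet.orRange. A toy example with made-up doc IDs:

import org.apache.lucene.util.FixedBitSet;

class OrRangeSketch {
  static void demo() {
    FixedBitSet block = new FixedBitSet(128); // bit i stands for doc docBitSetBase + i
    block.set(0);
    block.set(5); // docs 1000 and 1005
    int docBitSetBase = 1000;
    int offset = 996; // caller's window starts at doc 996
    FixedBitSet window = new FixedBitSet(64);
    // OR 16 docs' worth of bits, re-based from the block to the window:
    FixedBitSet.orRange(block, 0, window, docBitSetBase - offset, 16);
    assert window.get(4) && window.get(9); // docs 1000 and 1005 recorded
  }
}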
