Turning off patching in Lucene's PFOR encoding in Lucene90PostingsFormat

slow-J · Oct 18, 2023 · 83ec5a8 · 83ec5a8
1 parent f866970
commit 83ec5a8
Show file tree

Hide file tree

Showing 3 changed files with 313 additions and 54 deletions.
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForUtilNoP.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForUtilNoP.java
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90;
+
+import java.io.IOException;
+import java.util.Arrays;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.packed.PackedInts;
+
+/**
+ * Utility class to encode sequences of 128 small positive integers, with same API as PForUtil but
+ * with no exception patching. (PForUtil with exception related items removed.)
+ *
+ * <p>Each block starts with a one-byte token. A token of 0 means that all values of the block are
+ * equal and need at most 8 bits; it is followed by that single value as a vLong. Any other token
+ * is the number of bits per value and is followed by the values as packed by {@code ForUtil}.
+ */
+final class ForUtilNoP {
+
+  private static final int HALF_BLOCK_SIZE = ForUtil.BLOCK_SIZE / 2;
+
+  // The number of bits per value lives in the 5 low bits of the block token; the decoding methods
+  // mask away the 3 high bits.
+  private static final int BITS_PER_VALUE_MASK = 0x1f;
+
+  // IDENTITY_PLUS_ONE[i] == i + 1
+  private static final long[] IDENTITY_PLUS_ONE = new long[ForUtil.BLOCK_SIZE];
+
+  static {
+    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
+      IDENTITY_PLUS_ONE[i] = i + 1;
+    }
+  }
+
+  /** Returns whether the first {@code ForUtil.BLOCK_SIZE} entries of {@code l} are all equal. */
+  static boolean allEqual(long[] l) {
+    for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) {
+      if (l[i] != l[0]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private final ForUtil forUtil;
+
+  ForUtilNoP(ForUtil forUtil) {
+    // NOTE(review): nothing in this class stores a block index on a byte anymore, so this check
+    // looks like a leftover from PForUtil; confirm before removing it.
+    assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE;
+    this.forUtil = forUtil;
+  }
+
+  /**
+   * Encode 128 integers from {@code longs} into {@code out}. Only the token and the values are
+   * written: no exception bytes follow the block.
+   *
+   * @param longs the values to encode
+   * @param out the output that receives the token followed by the values
+   */
+  void encode(long[] longs, DataOutput out) throws IOException {
+    long topValue = longs[0];
+    for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) {
+      topValue = Math.max(topValue, longs[i]);
+    }
+
+    final int bitsPerValue = PackedInts.bitsRequired(topValue);
+    if (bitsPerValue <= 8 && allEqual(longs)) {
+      // a token of zero indicates all values are the same
+      out.writeByte((byte) 0);
+      out.writeVLong(longs[0]);
+    } else {
+      // A wider value would be cut down to 5 bits by the decoding methods and the block misread.
+      assert bitsPerValue <= BITS_PER_VALUE_MASK : "too many bits per value: " + bitsPerValue;
+      out.writeByte((byte) bitsPerValue);
+      forUtil.encode(longs, bitsPerValue, out);
+    }
+  }
+
+  /** Decode 128 integers into {@code longs}. */
+  void decode(DataInput in, long[] longs) throws IOException {
+    final int bitsPerValue = readBitsPerValue(in);
+    if (bitsPerValue == 0) {
+      // a bpv of zero indicates all values are the same
+      Arrays.fill(longs, 0, ForUtil.BLOCK_SIZE, in.readVLong());
+    } else {
+      forUtil.decode(bitsPerValue, in, longs);
+    }
+  }
+
+  /**
+   * Decode deltas, compute the prefix sum and add {@code base} to all decoded longs.
+   *
+   * @param base the value that the first delta is relative to
+   */
+  void decodeAndPrefixSum(DataInput in, long base, long[] longs) throws IOException {
+    final int bitsPerValue = readBitsPerValue(in);
+    if (bitsPerValue == 0) {
+      // a bpv of zero indicates all delta values are the same
+      final long val = in.readVLong();
+      if (val == 1) {
+        // this will often be the common case when working with doc IDs, so we special-case it to
+        // be slightly more efficient
+        prefixSumOfOnes(longs, base);
+      } else {
+        prefixSumOf(longs, base, val);
+      }
+    } else {
+      // decode the deltas then apply the prefix sum logic
+      forUtil.decodeTo32(bitsPerValue, in, longs);
+      prefixSum32(longs, base);
+    }
+  }
+
+  /** Skip 128 integers. */
+  void skip(DataInput in) throws IOException {
+    final int bitsPerValue = readBitsPerValue(in);
+    if (bitsPerValue == 0) {
+      in.readVLong();
+    } else {
+      in.skipBytes(forUtil.numBytes(bitsPerValue));
+    }
+  }
+
+  /** Reads the one-byte block token and returns the bits per value held in its 5 low bits. */
+  private static int readBitsPerValue(DataInput in) throws IOException {
+    return Byte.toUnsignedInt(in.readByte()) & BITS_PER_VALUE_MASK;
+  }
+
+  /** Fill {@code longs} with the final values for the case of all deltas being 1. */
+  private static void prefixSumOfOnes(long[] longs, long base) {
+    System.arraycopy(IDENTITY_PLUS_ONE, 0, longs, 0, ForUtil.BLOCK_SIZE);
+    // This loop gets auto-vectorized
+    for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) {
+      longs[i] += base;
+    }
+  }
+
+  /** Fill {@code longs} with the final values for the case of all deltas being {@code val}. */
+  private static void prefixSumOf(long[] longs, long base, long val) {
+    for (int i = 0; i < ForUtil.BLOCK_SIZE; i++) {
+      longs[i] = (i + 1) * val + base;
+    }
+  }
+
+  /** Apply prefix sum logic where the values are packed two-per-long in {@code longs}. */
+  private static void prefixSum32(long[] longs, long base) {
+    // the first value sits in the high 32 bits of longs[0]
+    longs[0] += base << 32;
+    innerPrefixSum32(longs);
+    expand32(longs);
+    // the second half was summed on its own: add the last value of the first half to it
+    final long l = longs[HALF_BLOCK_SIZE - 1];
+    for (int i = HALF_BLOCK_SIZE; i < ForUtil.BLOCK_SIZE; ++i) {
+      longs[i] += l;
+    }
+  }
+
+  /**
+   * Expand the values packed two-per-long in {@code longs} into 128 individual long values stored
+   * back into {@code longs}.
+   */
+  private static void expand32(long[] longs) {
+    for (int i = 0; i < 64; ++i) {
+      final long l = longs[i];
+      longs[i] = l >>> 32;
+      longs[64 + i] = l & 0xFFFFFFFFL;
+    }
+  }
+
+  /**
+   * Unrolled "inner" prefix sum logic where the values are packed two-per-long in {@code longs}.
+   * After this method, the final values will be correct for all high-order bits (values [0..63])
+   * but a final prefix loop will still need to run to "correct" the values of [64..127] in the
+   * low-order bits, which need the 64th value added to all of them.
+   */
+  private static void innerPrefixSum32(long[] longs) {
+    longs[1] += longs[0];
+    longs[2] += longs[1];
+    longs[3] += longs[2];
+    longs[4] += longs[3];
+    longs[5] += longs[4];
+    longs[6] += longs[5];
+    longs[7] += longs[6];
+    longs[8] += longs[7];
+    longs[9] += longs[8];
+    longs[10] += longs[9];
+    longs[11] += longs[10];
+    longs[12] += longs[11];
+    longs[13] += longs[12];
+    longs[14] += longs[13];
+    longs[15] += longs[14];
+    longs[16] += longs[15];
+    longs[17] += longs[16];
+    longs[18] += longs[17];
+    longs[19] += longs[18];
+    longs[20] += longs[19];
+    longs[21] += longs[20];
+    longs[22] += longs[21];
+    longs[23] += longs[22];
+    longs[24] += longs[23];
+    longs[25] += longs[24];
+    longs[26] += longs[25];
+    longs[27] += longs[26];
+    longs[28] += longs[27];
+    longs[29] += longs[28];
+    longs[30] += longs[29];
+    longs[31] += longs[30];
+    longs[32] += longs[31];
+    longs[33] += longs[32];
+    longs[34] += longs[33];
+    longs[35] += longs[34];
+    longs[36] += longs[35];
+    longs[37] += longs[36];
+    longs[38] += longs[37];
+    longs[39] += longs[38];
+    longs[40] += longs[39];
+    longs[41] += longs[40];
+    longs[42] += longs[41];
+    longs[43] += longs[42];
+    longs[44] += longs[43];
+    longs[45] += longs[44];
+    longs[46] += longs[45];
+    longs[47] += longs[46];
+    longs[48] += longs[47];
+    longs[49] += longs[48];
+    longs[50] += longs[49];
+    longs[51] += longs[50];
+    longs[52] += longs[51];
+    longs[53] += longs[52];
+    longs[54] += longs[53];
+    longs[55] += longs[54];
+    longs[56] += longs[55];
+    longs[57] += longs[56];
+    longs[58] += longs[57];
+    longs[59] += longs[58];
+    longs[60] += longs[59];
+    longs[61] += longs[60];
+    longs[62] += longs[61];
+    longs[63] += longs[62];
+  }
+}