LUCENE-4702: Terms dictionary compression. (#1126)
Compress blocks of suffixes in order to make the terms dictionary more
space-efficient. Two compression algorithms are used, depending on which one
saves more space:
 - LowercaseAsciiCompression, which applies when all bytes are in the
   `[0x1F,0x3F)` or `[0x5F,0x7F)` ranges (notably all digits, lowercase ASCII
   letters, '.', '-' and '_') and encodes 4 chars in 3 bytes; see the sketch
   after this list. It very often applies to analyzed content and decompresses
   very quickly thanks to auto-vectorization support in the JVM.
 - LZ4, when the compression ratio is less than 0.75.
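
A minimal sketch of the 4-chars-in-3-bytes idea, for illustration only (the real
LowercaseAsciiCompression in this commit operates on whole blocks and is more
involved, so this is not its actual implementation): each byte in the two
32-value ranges maps to a 6-bit code, and four codes pack into 24 bits, i.e. 3 bytes.

```java
// Illustration only (hypothetical helpers, not Lucene's LowercaseAsciiCompression):
// map a byte from [0x1F,0x3F) or [0x5F,0x7F) to a 6-bit code.
static int to6Bits(byte b) {
  if (b >= 0x1F && b < 0x3F) {
    return b - 0x1F;          // 0..31: digits, '.', '-', ...
  } else if (b >= 0x5F && b < 0x7F) {
    return b - 0x5F + 32;     // 32..63: '_' and lowercase ASCII letters
  }
  throw new IllegalArgumentException("byte not in a compressible range: " + b);
}

// Pack 4 input bytes into 3 output bytes: 4 x 6 bits = 24 bits.
static void pack4(byte[] in, int off, byte[] out, int outOff) {
  int bits = (to6Bits(in[off]) << 18)
      | (to6Bits(in[off + 1]) << 12)
      | (to6Bits(in[off + 2]) << 6)
      | to6Bits(in[off + 3]);
  out[outOff] = (byte) (bits >>> 16);
  out[outOff + 1] = (byte) (bits >>> 8);
  out[outOff + 2] = (byte) bits;
}
```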

I was a bit unhappy with the complexity of the high-compression LZ4 option, so
I simplified it to keep only the logic that detects duplicate strings. The
logic for handling overlapping matches, which was responsible for most of the
complexity while yielding only tiny benefits, has been removed.
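
For reference, a condensed sketch of how a block's suffixes end up being
encoded, pieced together from the BlockTreeTermsWriter change below (field
names such as `suffixWriter`, `spareWriter` and `numEntries` come from that
diff; this is a simplification of that code, not a separate API):

```java
// Condensed from the writer change below: try LZ4 first; if it does not save
// at least 25%, try LowercaseAsciiCompression; otherwise leave uncompressed.
CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION;
if (suffixWriter.length() > 2L * numEntries) { // skip blocks with very short suffixes
  LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
  if (spareWriter.getFilePointer() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
    compressionAlg = CompressionAlgorithm.LZ4; // compressed size < 0.75 x original
  } else {
    spareWriter.reset();
    if (LowercaseAsciiCompression.compress(suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) {
      compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII;
    }
  }
}
```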
jpountz committed Jan 27, 2020
1 parent ace4fcc commit 33a7af9
Showing 25 changed files with 1,467 additions and 802 deletions.
32 changes: 32 additions & 0 deletions lucene/LICENSE.txt
@@ -473,3 +473,35 @@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

---

core/src/java/org/apache/lucene/util/compress/LZ4.java is a Java
implementation of the LZ4 (https://github.com/lz4/lz4/tree/dev/lib)
compression format for Lucene's DataInput/DataOutput abstractions.

LZ4 Library
Copyright (c) 2011-2016, Yann Collet
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

5 changes: 5 additions & 0 deletions lucene/NOTICE.txt
@@ -30,6 +30,11 @@ http://bitbucket.org/jpbarrette/moman/overview/
The class org.apache.lucene.util.WeakIdentityMap was derived from
the Apache CXF project and is Apache License 2.0.

The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4
compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed
under the 2-clause BSD license.
(https://opensource.org/licenses/bsd-license.php)

The Google Code Prettify is Apache License 2.0.
See http://code.google.com/p/google-code-prettify/

@@ -131,8 +131,11 @@ public enum FSTLoadMode {
/** The long[] + byte[] metadata has been replaced with a single byte[]. */
public static final int VERSION_META_LONGS_REMOVED = 4;

/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;

/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_META_LONGS_REMOVED;
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;

/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
Expand Up @@ -34,6 +34,7 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
@@ -42,9 +43,12 @@
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.FutureArrays;
import org.apache.lucene.util.FutureObjects;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.compress.LZ4;
import org.apache.lucene.util.compress.LowercaseAsciiCompression;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
@@ -638,6 +642,16 @@ void writeBlocks(int prefixLength, int count) throws IOException {
newBlocks.clear();
}

private boolean allEqual(byte[] b, int startOffset, int endOffset, byte value) {
FutureObjects.checkFromToIndex(startOffset, endOffset, b.length);
for (int i = startOffset; i < endOffset; ++i) {
if (b[i] != value) {
return false;
}
}
return true;
}

/** Writes the specified slice (start is inclusive, end is exclusive)
* from pending stack as a new block. If isFloor is true, there
* were too many (more than maxItemsInBlock) entries sharing the
@@ -706,8 +720,8 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
//}

// For leaf block we write suffix straight
suffixWriter.writeVInt(suffix);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt(suffix);
suffixWriter.append(term.termBytes, prefixLength, suffix);
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;

// Write term stats, to separate byte[] blob:
@@ -718,7 +732,7 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
}

// Write term meta data
postingsWriter.encodeTerm(bytesWriter, fieldInfo, state, absolute);
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false;
}
} else {
@@ -744,8 +758,8 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
// it's a prefix term. Terms cannot be larger than ~32 KB
// so we won't run out of bits:

suffixWriter.writeVInt(suffix << 1);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt(suffix << 1);
suffixWriter.append(term.termBytes, prefixLength, suffix);

// Write term stats, to separate byte[] blob:
statsWriter.writeVInt(state.docFreq);
@@ -763,7 +777,7 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
// separate anymore:

// Write term meta data
postingsWriter.encodeTerm(bytesWriter, fieldInfo, state, absolute);
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false;
} else {
PendingBlock block = (PendingBlock) ent;
@@ -775,8 +789,8 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead

// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block:
suffixWriter.writeVInt((suffix<<1)|1);
suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt((suffix<<1)|1);
suffixWriter.append(block.prefix.bytes, prefixLength, suffix);

//if (DEBUG2) {
// BytesRef suffixBytes = new BytesRef(suffix);
@@ -788,27 +802,77 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel: "floorLeadLabel=" + floorLeadLabel + " suffixLead=" + (block.prefix.bytes[prefixLength] & 0xff);
assert block.fp < startFP;

suffixWriter.writeVLong(startFP - block.fp);
suffixLengthsWriter.writeVLong(startFP - block.fp);
subIndices.add(block.index);
}
}

assert subIndices.size() != 0;
}

// TODO: we could block-write the term suffix pointers;
// this would take more space but would enable binary
// search on lookup

// Write suffixes byte[] blob to terms dict output:
termsOut.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
suffixWriter.writeTo(termsOut);
suffixWriter.reset();
// Write suffixes byte[] blob to terms dict output, either uncompressed, compressed with LZ4 or with LowercaseAsciiCompression.
CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION;
// If there are 2 suffix bytes or less per term, then we don't bother compressing, as suffixes are unlikely to be
// what makes the terms dictionary large. This also tends to be the case for dense IDs like auto-increment IDs,
// so not compressing there helps avoid hurting ID lookups too much.
if (suffixWriter.length() > 2L * numEntries) {
LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
if (spareWriter.getFilePointer() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
// LZ4 saved more than 25%, go for it
compressionAlg = CompressionAlgorithm.LZ4;
} else {
spareWriter.reset();
if (spareBytes.length < suffixWriter.length()) {
spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)];
}
if (LowercaseAsciiCompression.compress(suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) {
compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII;
}
}
}
long token = ((long) suffixWriter.length()) << 3;
if (isLeafBlock) {
token |= 0x04;
}
token |= compressionAlg.code;
termsOut.writeVLong(token);
if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) {
termsOut.writeBytes(suffixWriter.bytes(), suffixWriter.length());
} else {
spareWriter.writeTo(termsOut);
}
suffixWriter.setLength(0);
spareWriter.reset();

// Write suffix lengths
final int numSuffixBytes = Math.toIntExact(suffixLengthsWriter.getFilePointer());
spareBytes = ArrayUtil.grow(spareBytes, numSuffixBytes);
suffixLengthsWriter.writeTo(new ByteArrayDataOutput(spareBytes));
suffixLengthsWriter.reset();
if (allEqual(spareBytes, 1, numSuffixBytes, spareBytes[0])) {
// Structured fields like IDs often have most values of the same length
termsOut.writeVInt((numSuffixBytes << 1) | 1);
termsOut.writeByte(spareBytes[0]);
} else {
// Still give LZ4 a chance, there might be runs of terms with the same length
termsOut.writeVInt(numSuffixBytes << 1);
LZ4.compress(spareBytes, 0, numSuffixBytes, termsOut, compressionHashTable);
}

// Write term stats byte[] blob
termsOut.writeVInt((int) statsWriter.getFilePointer());
statsWriter.writeTo(termsOut);
// Stats
final int numStatsBytes = Math.toIntExact(statsWriter.getFilePointer());
spareBytes = ArrayUtil.grow(spareBytes, numStatsBytes);
statsWriter.writeTo(new ByteArrayDataOutput(spareBytes));
statsWriter.reset();
if (allEqual(spareBytes, 0, numStatsBytes, (byte) 1)) {
// ID fields would typically have blocks full of ones
// LZ4 would optimize this as well but we keep explicit specialization because the decoding logic is a bit faster
termsOut.writeVInt((numStatsBytes << 1) | 1);
} else {
// Still give LZ4 a chance otherwise, there might be runs of ones even if not all values are ones
termsOut.writeVInt(numStatsBytes << 1);
LZ4.compress(spareBytes, 0, numStatsBytes, termsOut, compressionHashTable);
}

// Write term meta data byte[] blob
termsOut.writeVInt((int) metaWriter.getFilePointer());
@@ -956,9 +1020,13 @@ public void finish() throws IOException {
}
}

private final RAMOutputStream suffixWriter = new RAMOutputStream();
private final RAMOutputStream suffixLengthsWriter = new RAMOutputStream();
private final BytesRefBuilder suffixWriter = new BytesRefBuilder();
private final RAMOutputStream statsWriter = new RAMOutputStream();
private final RAMOutputStream metaWriter = new RAMOutputStream();
private final RAMOutputStream spareWriter = new RAMOutputStream();
private byte[] spareBytes = BytesRef.EMPTY_BYTES;
private final LZ4.HighCompressionHashTable compressionHashTable = new LZ4.HighCompressionHashTable();
}

private boolean closed;
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktree;

import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.compress.LowercaseAsciiCompression;

/**
* Compression algorithm used for suffixes of a block of terms.
*/
enum CompressionAlgorithm {

NO_COMPRESSION(0x00) {

@Override
void read(DataInput in, byte[] out, int len) throws IOException {
in.readBytes(out, 0, len);
}

},

LOWERCASE_ASCII(0x01) {

@Override
void read(DataInput in, byte[] out, int len) throws IOException {
LowercaseAsciiCompression.decompress(in, out, len);
}

},

LZ4(0x02) {

@Override
void read(DataInput in, byte[] out, int len) throws IOException {
org.apache.lucene.util.compress.LZ4.decompress(in, len, out, 0);
}

};

private static final CompressionAlgorithm[] BY_CODE = new CompressionAlgorithm[3];
static {
for (CompressionAlgorithm alg : CompressionAlgorithm.values()) {
BY_CODE[alg.code] = alg;
}
}

/**
* Look up a {@link CompressionAlgorithm} by its {@link CompressionAlgorithm#code}.
*/
static final CompressionAlgorithm byCode(int code) {
if (code < 0 || code >= BY_CODE.length) {
throw new IllegalArgumentException("Illegal code for a compression algorithm: " + code);
}
return BY_CODE[code];
}

public final int code;

private CompressionAlgorithm(int code) {
this.code = code;
}

abstract void read(DataInput in, byte[] out, int len) throws IOException;

}
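
As a usage note, here is a hedged sketch of the read side: the writer above
packs the uncompressed suffix length, a leaf-block flag and the compression
code into a single vlong token (`length << 3 | leafBit | code`), so a reader
in the same package could decode a block roughly like this (variable names are
illustrative; only the bit layout is taken from the writer change):

```java
// Illustrative decoding; names like termsIn and suffixBytes are placeholders.
long token = termsIn.readVLong();
int numSuffixBytes = Math.toIntExact(token >>> 3);
boolean isLeafBlock = (token & 0x04) != 0;
CompressionAlgorithm compressionAlg = CompressionAlgorithm.byCode((int) (token & 0x03));
byte[] suffixBytes = new byte[numSuffixBytes];
compressionAlg.read(termsIn, suffixBytes, numSuffixBytes); // read() is package-private, so same-package callers only
```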
@@ -252,6 +252,7 @@ private void seekToStartTerm(BytesRef target) throws IOException {
while (true) {
final int savNextEnt = currentFrame.nextEnt;
final int savePos = currentFrame.suffixesReader.getPosition();
final int saveLengthPos = currentFrame.suffixLengthsReader.getPosition();
final int saveStartBytePos = currentFrame.startBytePos;
final int saveSuffix = currentFrame.suffix;
final long saveLastSubFP = currentFrame.lastSubFP;
@@ -294,6 +295,7 @@ private void seekToStartTerm(BytesRef target) throws IOException {
currentFrame.startBytePos = saveStartBytePos;
currentFrame.suffix = saveSuffix;
currentFrame.suffixesReader.setPosition(savePos);
currentFrame.suffixLengthsReader.setPosition(saveLengthPos);
currentFrame.termState.termBlockOrd = saveTermBlockOrd;
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
term.length = currentFrame.prefix + currentFrame.suffix;