LUCENE-4702: Terms dictionary compression. (#1126)
Compress blocks of suffixes in order to make the terms dictionary more
space-efficient. Two compression algorithms are used, depending on which one
saves more space:
 - LowercaseAsciiCompression, which applies when all bytes are in the
   `[0x1F,0x3F)` or `[0x5F,0x7F)` ranges (notably all digits, lowercase ASCII
   letters, '.', '-' and '_') and encodes 4 chars in 3 bytes; see the sketch
   after this list. It very often applies to analyzed content and decompresses
   very quickly thanks to auto-vectorization support in the JVM.
 - LZ4, when the compression ratio is less than 0.75.
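
A minimal sketch of the 4-chars-in-3-bytes idea, for illustration only (the real
LowercaseAsciiCompression in this commit operates on whole blocks and is more
involved, so this is not its actual implementation): each byte in the two
32-value ranges maps to a 6-bit code, and four codes pack into 24 bits, i.e. 3 bytes.

```java
// Illustration only (hypothetical helpers, not Lucene's LowercaseAsciiCompression):
// map a byte from [0x1F,0x3F) or [0x5F,0x7F) to a 6-bit code.
static int to6Bits(byte b) {
  if (b >= 0x1F && b < 0x3F) {
    return b - 0x1F;          // 0..31: digits, '.', '-', ...
  } else if (b >= 0x5F && b < 0x7F) {
    return b - 0x5F + 32;     // 32..63: '_' and lowercase ASCII letters
  }
  throw new IllegalArgumentException("byte not in a compressible range: " + b);
}

// Pack 4 input bytes into 3 output bytes: 4 x 6 bits = 24 bits.
static void pack4(byte[] in, int off, byte[] out, int outOff) {
  int bits = (to6Bits(in[off]) << 18)
      | (to6Bits(in[off + 1]) << 12)
      | (to6Bits(in[off + 2]) << 6)
      | to6Bits(in[off + 3]);
  out[outOff] = (byte) (bits >>> 16);
  out[outOff + 1] = (byte) (bits >>> 8);
  out[outOff + 2] = (byte) bits;
}
```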

I was a bit unhappy with the complexity of the high-compression LZ4 option, so
I simplified it to keep only the logic that detects duplicate strings. The
logic for handling overlapping matches, which was responsible for most of the
complexity while yielding only tiny benefits, has been removed.
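
For reference, a condensed sketch of how a block's suffixes end up being
encoded, pieced together from the BlockTreeTermsWriter change below (field
names such as `suffixWriter`, `spareWriter` and `numEntries` come from that
diff; this is a simplification of that code, not a separate API):

```java
// Condensed from the writer change below: try LZ4 first; if it does not save
// at least 25%, try LowercaseAsciiCompression; otherwise leave uncompressed.
CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION;
if (suffixWriter.length() > 2L * numEntries) { // skip blocks with very short suffixes
  LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
  if (spareWriter.getFilePointer() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
    compressionAlg = CompressionAlgorithm.LZ4; // compressed size < 0.75 x original
  } else {
    spareWriter.reset();
    if (LowercaseAsciiCompression.compress(suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) {
      compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII;
    }
  }
}
```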
jpountz committed Jan 27, 2020
1 parent ace4fcc commit 33a7af9
Showing 25 changed files with 1,467 additions and 802 deletions.
32 changes: 32 additions & 0 deletions lucene/LICENSE.txt
@@ -473,3 +473,35 @@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

---

core/src/java/org/apache/lucene/util/compress/LZ4.java is a Java
implementation of the LZ4 (https://github.com/lz4/lz4/tree/dev/lib)
compression format for Lucene's DataInput/DataOutput abstractions.

LZ4 Library
Copyright (c) 2011-2016, Yann Collet
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

5 changes: 5 additions & 0 deletions lucene/NOTICE.txt
@@ -30,6 +30,11 @@ http://bitbucket.org/jpbarrette/moman/overview/
The class org.apache.lucene.util.WeakIdentityMap was derived from
the Apache CXF project and is Apache License 2.0.

The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4
compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed
under the 2-clause BSD license.
(https://opensource.org/licenses/bsd-license.php)

The Google Code Prettify is Apache License 2.0.
See http://code.google.com/p/google-code-prettify/

@@ -131,8 +131,11 @@ public enum FSTLoadMode {
/** The long[] + byte[] metadata has been replaced with a single byte[]. */
public static final int VERSION_META_LONGS_REMOVED = 4;

/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;

/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_META_LONGS_REMOVED;
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;

/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
Expand Up @@ -34,6 +34,7 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
@@ -42,9 +43,12 @@
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.FutureArrays;
import org.apache.lucene.util.FutureObjects;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.compress.LZ4;
import org.apache.lucene.util.compress.LowercaseAsciiCompression;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
@@ -638,6 +642,16 @@ void writeBlocks(int prefixLength, int count) throws IOException {
newBlocks.clear();
}

private boolean allEqual(byte[] b, int startOffset, int endOffset, byte value) {
FutureObjects.checkFromToIndex(startOffset, endOffset, b.length);
for (int i = startOffset; i < endOffset; ++i) {
if (b[i] != value) {
return false;
}
}
return true;
}

/** Writes the specified slice (start is inclusive, end is exclusive)
* from pending stack as a new block. If isFloor is true, there
* were too many (more than maxItemsInBlock) entries sharing the
@@ -706,8 +720,8 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
//}

// For leaf block we write suffix straight
suffixWriter.writeVInt(suffix);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt(suffix);
suffixWriter.append(term.termBytes, prefixLength, suffix);
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;

// Write term stats, to separate byte[] blob:
@@ -718,7 +732,7 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
}

// Write term meta data
postingsWriter.encodeTerm(bytesWriter, fieldInfo, state, absolute);
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false;
}
} else {
@@ -744,8 +758,8 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
// it's a prefix term. Terms cannot be larger than ~32 KB
// so we won't run out of bits:

suffixWriter.writeVInt(suffix << 1);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt(suffix << 1);
suffixWriter.append(term.termBytes, prefixLength, suffix);

// Write term stats, to separate byte[] blob:
statsWriter.writeVInt(state.docFreq);
@@ -763,7 +777,7 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
// separate anymore:

// Write term meta data
postingsWriter.encodeTerm(bytesWriter, fieldInfo, state, absolute);
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false;
} else {
PendingBlock block = (PendingBlock) ent;
@@ -775,8 +789,8 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead

// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block:
suffixWriter.writeVInt((suffix<<1)|1);
suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
suffixLengthsWriter.writeVInt((suffix<<1)|1);
suffixWriter.append(block.prefix.bytes, prefixLength, suffix);

//if (DEBUG2) {
// BytesRef suffixBytes = new BytesRef(suffix);
@@ -788,27 +802,77 @@ private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLead
assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel: "floorLeadLabel=" + floorLeadLabel + " suffixLead=" + (block.prefix.bytes[prefixLength] & 0xff);
assert block.fp < startFP;

suffixWriter.writeVLong(startFP - block.fp);
suffixLengthsWriter.writeVLong(startFP - block.fp);
subIndices.add(block.index);
}
}

assert subIndices.size() != 0;
}

// TODO: we could block-write the term suffix pointers;
// this would take more space but would enable binary
// search on lookup

// Write suffixes byte[] blob to terms dict output:
termsOut.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
suffixWriter.writeTo(termsOut);
suffixWriter.reset();
// Write suffixes byte[] blob to terms dict output, either uncompressed, compressed with LZ4 or with LowercaseAsciiCompression.
CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION;
// If there are 2 suffix bytes or less per term, then we don't bother compressing, as suffixes are unlikely to be
// what makes the terms dictionary large. This also tends to be the case for dense IDs like auto-increment IDs,
// so not compressing there helps avoid hurting ID lookups too much.
if (suffixWriter.length() > 2L * numEntries) {
LZ4.compress(suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable);
if (spareWriter.getFilePointer() < suffixWriter.length() - (suffixWriter.length() >>> 2)) {
// LZ4 saved more than 25%, go for it
compressionAlg = CompressionAlgorithm.LZ4;
} else {
spareWriter.reset();
if (spareBytes.length < suffixWriter.length()) {
spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)];
}
if (LowercaseAsciiCompression.compress(suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) {
compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII;
}
}
}
long token = ((long) suffixWriter.length()) << 3;
if (isLeafBlock) {
token |= 0x04;
}
token |= compressionAlg.code;
termsOut.writeVLong(token);
if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) {
termsOut.writeBytes(suffixWriter.bytes(), suffixWriter.length());
} else {
spareWriter.writeTo(termsOut);
}
suffixWriter.setLength(0);
spareWriter.reset();

// Write suffix lengths
final int numSuffixBytes = Math.toIntExact(suffixLengthsWriter.getFilePointer());
spareBytes = ArrayUtil.grow(spareBytes, numSuffixBytes);
suffixLengthsWriter.writeTo(new ByteArrayDataOutput(spareBytes));
suffixLengthsWriter.reset();
if (allEqual(spareBytes, 1, numSuffixBytes, spareBytes[0])) {
// Structured fields like IDs often have most values of the same length
termsOut.writeVInt((numSuffixBytes << 1) | 1);
termsOut.writeByte(spareBytes[0]);
} else {
// Still give LZ4 a chance, there might be runs of terms with the same length
termsOut.writeVInt(numSuffixBytes << 1);
LZ4.compress(spareBytes, 0, numSuffixBytes, termsOut, compressionHashTable);
}

// Write term stats byte[] blob
termsOut.writeVInt((int) statsWriter.getFilePointer());
statsWriter.writeTo(termsOut);
// Stats
final int numStatsBytes = Math.toIntExact(statsWriter.getFilePointer());
spareBytes = ArrayUtil.grow(spareBytes, numStatsBytes);
statsWriter.writeTo(new ByteArrayDataOutput(spareBytes));
statsWriter.reset();
if (allEqual(spareBytes, 0, numStatsBytes, (byte) 1)) {
// ID fields would typically have blocks full of ones
// LZ4 would optimize this as well but we keep explicit specialization because the decoding logic is a bit faster
termsOut.writeVInt((numStatsBytes << 1) | 1);
} else {
// Still give LZ4 a chance otherwise, there might be runs of ones even if not all values are ones
termsOut.writeVInt(numStatsBytes << 1);
LZ4.compress(spareBytes, 0, numStatsBytes, termsOut, compressionHashTable);
}

// Write term meta data byte[] blob
termsOut.writeVInt((int) metaWriter.getFilePointer());
@@ -956,9 +1020,13 @@ public void finish() throws IOException {
}
}

private final RAMOutputStream suffixWriter = new RAMOutputStream();
private final RAMOutputStream suffixLengthsWriter = new RAMOutputStream();
private final BytesRefBuilder suffixWriter = new BytesRefBuilder();
private final RAMOutputStream statsWriter = new RAMOutputStream();
private final RAMOutputStream metaWriter = new RAMOutputStream();
private final RAMOutputStream spareWriter = new RAMOutputStream();
private byte[] spareBytes = BytesRef.EMPTY_BYTES;
private final LZ4.HighCompressionHashTable compressionHashTable = new LZ4.HighCompressionHashTable();
}

private boolean closed;
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktree;

import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.compress.LowercaseAsciiCompression;

/**
* Compression algorithm used for suffixes of a block of terms.
*/
enum CompressionAlgorithm {

NO_COMPRESSION(0x00) {

@Override
void read(DataInput in, byte[] out, int len) throws IOException {
in.readBytes(out, 0, len);
}

},

LOWERCASE_ASCII(0x01) {

@Override
void read(DataInput in, byte[] out, int len) throws IOException {
LowercaseAsciiCompression.decompress(in, out, len);
}

},

LZ4(0x02) {

@Override
void read(DataInput in, byte[] out, int len) throws IOException {
org.apache.lucene.util.compress.LZ4.decompress(in, len, out, 0);
}

};

private static final CompressionAlgorithm[] BY_CODE = new CompressionAlgorithm[3];
static {
for (CompressionAlgorithm alg : CompressionAlgorithm.values()) {
BY_CODE[alg.code] = alg;
}
}

/**
* Look up a {@link CompressionAlgorithm} by its {@link CompressionAlgorithm#code}.
*/
static final CompressionAlgorithm byCode(int code) {
if (code < 0 || code >= BY_CODE.length) {
throw new IllegalArgumentException("Illegal code for a compression algorithm: " + code);
}
return BY_CODE[code];
}

public final int code;

private CompressionAlgorithm(int code) {
this.code = code;
}

abstract void read(DataInput in, byte[] out, int len) throws IOException;

}
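
As a usage note, here is a hedged sketch of the read side: the writer above
packs the uncompressed suffix length, a leaf-block flag and the compression
code into a single vlong token (`length << 3 | leafBit | code`), so a reader
in the same package could decode a block roughly like this (variable names are
illustrative; only the bit layout is taken from the writer change):

```java
// Illustrative decoding; names like termsIn and suffixBytes are placeholders.
long token = termsIn.readVLong();
int numSuffixBytes = Math.toIntExact(token >>> 3);
boolean isLeafBlock = (token & 0x04) != 0;
CompressionAlgorithm compressionAlg = CompressionAlgorithm.byCode((int) (token & 0x03));
byte[] suffixBytes = new byte[numSuffixBytes];
compressionAlg.read(termsIn, suffixBytes, numSuffixBytes); // read() is package-private, so same-package callers only
```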
@@ -252,6 +252,7 @@ private void seekToStartTerm(BytesRef target) throws IOException {
while (true) {
final int savNextEnt = currentFrame.nextEnt;
final int savePos = currentFrame.suffixesReader.getPosition();
final int saveLengthPos = currentFrame.suffixLengthsReader.getPosition();
final int saveStartBytePos = currentFrame.startBytePos;
final int saveSuffix = currentFrame.suffix;
final long saveLastSubFP = currentFrame.lastSubFP;
@@ -294,6 +295,7 @@ private void seekToStartTerm(BytesRef target) throws IOException {
currentFrame.startBytePos = saveStartBytePos;
currentFrame.suffix = saveSuffix;
currentFrame.suffixesReader.setPosition(savePos);
currentFrame.suffixLengthsReader.setPosition(saveLengthPos);
currentFrame.termState.termBlockOrd = saveTermBlockOrd;
System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
term.length = currentFrame.prefix + currentFrame.suffix;