Skip to content

Commit

Permalink
Add ordering of files in compound files (#12241)
Browse files Browse the repository at this point in the history
Today there is no specific ordering of how files are written to a compound file.
The current order is determined by iterating over the set of file names in
SegmentInfo, which is undefined. This commit changes this to an ordering based
on file size. Colocating data from files that are smaller (typically metadata
files like the terms index, field infos, etc.) but accessed often can help when
parts of these files are held in cache.
  • Loading branch information
cbuescher authored and romseygeek committed Apr 26, 2023
1 parent 615f456 commit 246ac4b
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 2 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ Optimizations

* GITHUB#12198, GITHUB#12199: Reduced contention when indexing with many threads. (Adrien Grand)

* GITHUB#12241: Add ordering of files in compound files. (Christoph Büscher)

Bug Fixes
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.PriorityQueue;

/**
* Lucene 9.0 compound file format
Expand Down Expand Up @@ -102,11 +103,40 @@ public void write(Directory dir, SegmentInfo si, IOContext context) throws IOExc
}
}

/**
 * Lightweight association of a file name with its on-disk length, used to
 * order a segment's files by size before writing them to the compound file.
 */
private static class SizedFile {
  private final String name; // file name within the segment
  private final long length; // file length in bytes

  private SizedFile(String fileName, long fileLength) {
    this.name = fileName;
    this.length = fileLength;
  }
}

/**
 * Priority queue over {@link SizedFile} that yields entries in ascending
 * length order, so the smallest file is popped first.
 */
private static class SizedFileQueue extends PriorityQueue<SizedFile> {
  SizedFileQueue(int maxSize) {
    super(maxSize);
  }

  @Override
  protected boolean lessThan(SizedFile left, SizedFile right) {
    // ascending by length: smaller files come off the queue first
    return left.length < right.length;
  }
}

private void writeCompoundFile(
IndexOutput entries, IndexOutput data, Directory dir, SegmentInfo si) throws IOException {
// write number of files
entries.writeVInt(si.files().size());
for (String file : si.files()) {
int numFiles = si.files().size();
entries.writeVInt(numFiles);
// first put files in ascending size order so small files fit more likely into one page
SizedFileQueue pq = new SizedFileQueue(numFiles);
for (String filename : si.files()) {
pq.add(new SizedFile(filename, dir.fileLength(filename)));
}
while (pq.size() > 0) {
SizedFile sizedFile = pq.pop();
String file = sizedFile.name;
// align file start offset
long startOffset = data.alignFilePointer(Long.BYTES);
// write bytes for file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,17 @@
*/
package org.apache.lucene.codecs.lucene90;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.tests.index.BaseCompoundFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;

Expand All @@ -27,4 +37,61 @@ public class TestLucene90CompoundFormat extends BaseCompoundFormatTestCase {
/** Returns the codec under test; backed by a {@code codec} field declared elsewhere in this class. */
protected Codec getCodec() {
return codec;
}

/**
 * Verifies that the compound format writes files ordered by ascending length,
 * regardless of the (shuffled) order in which {@link SegmentInfo} lists them.
 * Reads the entries file back and checks that entry names appear in size order
 * and that offsets strictly increase and lengths never decrease.
 */
public void testFileLengthOrdering() throws IOException {
  // try-with-resources so the directory is closed even when an assertion
  // fails; otherwise the leaked directory would mask the real test failure
  try (Directory dir = newDirectory()) {
    // Setup the test segment
    String segment = "_123";
    int chunk = 1024; // internal buffer size used by the stream
    SegmentInfo si = newSegmentInfo(dir, segment);
    byte[] segId = si.getId();
    List<String> orderedFiles = new ArrayList<>();
    int randomFileSize = random().nextInt(chunk);
    for (int i = 0; i < 10; i++) {
      String filename = segment + "." + i;
      createRandomFile(dir, filename, randomFileSize, segId);
      // grow each subsequent file by at least one byte so sizes strictly increase
      randomFileSize += random().nextInt(100) + 1;
      orderedFiles.add(filename);
    }
    // hand the files to the writer in randomized order
    List<String> shuffledFiles = new ArrayList<>(orderedFiles);
    Collections.shuffle(shuffledFiles, random());
    si.setFiles(shuffledFiles);
    si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);

    // entries file should contain files ordered by their size
    String entriesFileName =
        IndexFileNames.segmentFileName(si.name, "", Lucene90CompoundFormat.ENTRIES_EXTENSION);
    try (ChecksumIndexInput entriesStream =
        dir.openChecksumInput(entriesFileName, IOContext.READ)) {
      Throwable priorE = null;
      try {
        CodecUtil.checkIndexHeader(
            entriesStream,
            Lucene90CompoundFormat.ENTRY_CODEC,
            Lucene90CompoundFormat.VERSION_START,
            Lucene90CompoundFormat.VERSION_CURRENT,
            si.getId(),
            "");
        final int numEntries = entriesStream.readVInt();
        long lastOffset = 0;
        long lastLength = 0;
        for (int i = 0; i < numEntries; i++) {
          // entry ids are stored relative to the segment name
          final String id = entriesStream.readString();
          assertEquals(orderedFiles.get(i), segment + id);
          long offset = entriesStream.readLong();
          assertTrue(offset > lastOffset);
          lastOffset = offset;
          long length = entriesStream.readLong();
          assertTrue(length >= lastLength);
          lastLength = length;
        }
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        // validates the checksum footer and rethrows any earlier failure
        CodecUtil.checkFooter(entriesStream, priorE);
      }
    }
  }
}
}

0 comments on commit 246ac4b

Please sign in to comment.