diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a1d2ed19d44e..77aa0596348a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -69,6 +69,8 @@ Optimizations * GITHUB#12198, GITHUB#12199: Reduced contention when indexing with many threads. (Adrien Grand) +* GITHUB#12241: Add ordering of files in compound files. (Christoph Büscher) + Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java index bca1721b0e8a..72d83853879b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java @@ -27,6 +27,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.PriorityQueue; /** * Lucene 9.0 compound file format @@ -102,11 +103,40 @@ public void write(Directory dir, SegmentInfo si, IOContext context) throws IOExc } } + private static class SizedFile { + private final String name; + private final long length; + + private SizedFile(String name, long length) { + this.name = name; + this.length = length; + } + } + + private static class SizedFileQueue extends PriorityQueue<SizedFile> { + SizedFileQueue(int maxSize) { + super(maxSize); + } + + @Override + protected boolean lessThan(SizedFile sf1, SizedFile sf2) { + return sf1.length < sf2.length; + } + } + private void writeCompoundFile( IndexOutput entries, IndexOutput data, Directory dir, SegmentInfo si) throws IOException { // write number of files - entries.writeVInt(si.files().size()); - for (String file : si.files()) { + int numFiles = si.files().size(); + entries.writeVInt(numFiles); + // first put files in ascending size order so small files fit more likely into one page + SizedFileQueue pq = new SizedFileQueue(numFiles); + for (String 
filename : si.files()) { + pq.add(new SizedFile(filename, dir.fileLength(filename))); + } + while (pq.size() > 0) { + SizedFile sizedFile = pq.pop(); + String file = sizedFile.name; // align file start offset long startOffset = data.alignFilePointer(Long.BYTES); // write bytes for file diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java index 506462f6d6b8..eccb7f3ceb83 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java @@ -16,7 +16,17 @@ */ package org.apache.lucene.codecs.lucene90; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.index.BaseCompoundFormatTestCase; import org.apache.lucene.tests.util.TestUtil; @@ -27,4 +37,61 @@ public class TestLucene90CompoundFormat extends BaseCompoundFormatTestCase { protected Codec getCodec() { return codec; } + + public void testFileLengthOrdering() throws IOException { + Directory dir = newDirectory(); + // Setup the test segment + String segment = "_123"; + int chunk = 1024; // internal buffer size used by the stream + SegmentInfo si = newSegmentInfo(dir, segment); + byte[] segId = si.getId(); + List<String> orderedFiles = new ArrayList<>(); + int randomFileSize = random().nextInt(chunk); + for (int i = 0; i < 10; i++) { + String filename = segment + "." 
+ i; + createRandomFile(dir, filename, randomFileSize, segId); + // increase the next files size by a random amount + randomFileSize += random().nextInt(100) + 1; + orderedFiles.add(filename); + } + List<String> shuffledFiles = new ArrayList<>(orderedFiles); + Collections.shuffle(shuffledFiles, random()); + si.setFiles(shuffledFiles); + si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); + + // entries file should contain files ordered by their size + String entriesFileName = + IndexFileNames.segmentFileName(si.name, "", Lucene90CompoundFormat.ENTRIES_EXTENSION); + try (ChecksumIndexInput entriesStream = + dir.openChecksumInput(entriesFileName, IOContext.READ)) { + Throwable priorE = null; + try { + CodecUtil.checkIndexHeader( + entriesStream, + Lucene90CompoundFormat.ENTRY_CODEC, + Lucene90CompoundFormat.VERSION_START, + Lucene90CompoundFormat.VERSION_CURRENT, + si.getId(), + ""); + final int numEntries = entriesStream.readVInt(); + long lastOffset = 0; + long lastLength = 0; + for (int i = 0; i < numEntries; i++) { + final String id = entriesStream.readString(); + assertEquals(orderedFiles.get(i), segment + id); + long offset = entriesStream.readLong(); + assertTrue(offset > lastOffset); + lastOffset = offset; + long length = entriesStream.readLong(); + assertTrue(length >= lastLength); + lastLength = length; + } + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(entriesStream, priorE); + } + } + dir.close(); + } }