Add ordering of files in compound files (#12241)

Today there is no specific ordering of the files written to a compound file.
The current order is determined by iterating over the set of file names in
SegmentInfo, which is undefined. This commit changes this to an ordering by
ascending file size. Colocating data from files that are small but accessed
often (typically metadata files like the terms index, field infos etc.) can
help when parts of these files are held in cache.
Christoph Büscher 2023-04-26 15:01:02 +02:00 committed by GitHub
parent b0befef912
commit f45e096304
3 changed files with 100 additions and 2 deletions
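
For intuition, here is a minimal sketch of the ordering idea, assuming a Lucene
Directory named dir and the file names from SegmentInfo. The patch itself uses
a PriorityQueue rather than an explicit sort (see the diff below), and the
helper name bySizeAscending is made up for illustration:

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.store.Directory;

static List<String> bySizeAscending(Directory dir, Collection<String> files) throws IOException {
  // copy the names, then sort by on-disk length so the smallest files come first
  List<String> sorted = new ArrayList<>(files);
  sorted.sort(
      Comparator.comparingLong(
          name -> {
            try {
              return dir.fileLength(name);
            } catch (IOException e) {
              throw new UncheckedIOException(e);
            }
          }));
  return sorted;
}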

lucene/CHANGES.txt

@@ -176,6 +176,8 @@ Optimizations

* GITHUB#12198, GITHUB#12199: Reduced contention when indexing with many threads. (Adrien Grand)

* GITHUB#12241: Add ordering of files in compound files. (Christoph Büscher)

Bug Fixes
---------------------

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java

@@ -27,6 +27,7 @@ import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.PriorityQueue;
/**
 * Lucene 9.0 compound file format
@@ -102,11 +103,40 @@ public final class Lucene90CompoundFormat extends CompoundFormat {
    }
  }

  private static class SizedFile {
    private final String name;
    private final long length;

    private SizedFile(String name, long length) {
      this.name = name;
      this.length = length;
    }
  }

  private static class SizedFileQueue extends PriorityQueue<SizedFile> {
    SizedFileQueue(int maxSize) {
      super(maxSize);
    }

    @Override
    protected boolean lessThan(SizedFile sf1, SizedFile sf2) {
      return sf1.length < sf2.length;
    }
  }
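
  // Illustration only, not part of the patch: because lessThan() compares
  // lengths, pop() always returns the smallest remaining file. Adding
  // ("_1.fnm", 120), ("_1.dvd", 48000) and ("_1.tip", 900) in any order
  // pops "_1.fnm" first, then "_1.tip", then "_1.dvd".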

  private void writeCompoundFile(
      IndexOutput entries, IndexOutput data, Directory dir, SegmentInfo si) throws IOException {
    // write number of files
    int numFiles = si.files().size();
    entries.writeVInt(numFiles);
    // first put files in ascending size order, so that small files are more
    // likely to fit into a single page
    SizedFileQueue pq = new SizedFileQueue(numFiles);
    for (String filename : si.files()) {
      pq.add(new SizedFile(filename, dir.fileLength(filename)));
    }
    while (pq.size() > 0) {
      SizedFile sizedFile = pq.pop();
      String file = sizedFile.name;
      // align file start offset
      long startOffset = data.alignFilePointer(Long.BYTES);
      // write bytes for file
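      // (The hunk is truncated here. As a hedged sketch, inferred from the
      // test below rather than quoted from the patch, the loop goes on to
      // copy the file's bytes into the data output and record one entry per
      // file; the variable holding the copied length is hypothetical.)
      //   entries.writeString(IndexFileNames.stripSegmentName(file)); // id, e.g. ".fnm"
      //   entries.writeLong(startOffset); // aligned start offset in the data file
      //   entries.writeLong(length);      // number of bytes written for this file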

lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java

@@ -16,7 +16,17 @@
*/
package org.apache.lucene.codecs.lucene90;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.tests.index.BaseCompoundFormatTestCase;
import org.apache.lucene.tests.util.TestUtil;
@@ -27,4 +37,60 @@ public class TestLucene90CompoundFormat extends BaseCompoundFormatTestCase {
  protected Codec getCodec() {
    return codec;
  }

  public void testFileLengthOrdering() throws IOException {
    Directory dir = newDirectory();
    // Set up the test segment
    String segment = "_123";
    int chunk = 1024; // internal buffer size used by the stream
    SegmentInfo si = newSegmentInfo(dir, segment);
    byte[] segId = si.getId();
    List<String> orderedFiles = new ArrayList<>();
    int randomFileSize = random().nextInt(0, chunk);
    for (int i = 0; i < 10; i++) {
      String filename = segment + "." + i;
      createRandomFile(dir, filename, randomFileSize, segId);
      // increase the next file's size by a random amount
      randomFileSize += random().nextInt(1, 100);
      orderedFiles.add(filename);
    }
    List<String> shuffledFiles = new ArrayList<>(orderedFiles);
    Collections.shuffle(shuffledFiles, random());
    si.setFiles(shuffledFiles);
    si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);

    // the entries file should contain the files ordered by their size
    String entriesFileName =
        IndexFileNames.segmentFileName(si.name, "", Lucene90CompoundFormat.ENTRIES_EXTENSION);
    try (ChecksumIndexInput entriesStream = dir.openChecksumInput(entriesFileName)) {
      Throwable priorE = null;
      try {
        CodecUtil.checkIndexHeader(
            entriesStream,
            Lucene90CompoundFormat.ENTRY_CODEC,
            Lucene90CompoundFormat.VERSION_START,
            Lucene90CompoundFormat.VERSION_CURRENT,
            si.getId(),
            "");
        final int numEntries = entriesStream.readVInt();
        long lastOffset = 0;
        long lastLength = 0;
        for (int i = 0; i < numEntries; i++) {
          final String id = entriesStream.readString();
          assertEquals(orderedFiles.get(i), segment + id);
          long offset = entriesStream.readLong();
          assertTrue(offset > lastOffset);
          lastOffset = offset;
          long length = entriesStream.readLong();
          assertTrue(length >= lastLength);
          lastLength = length;
        }
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(entriesStream, priorE);
      }
    }
    dir.close();
  }
}
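
For reference, the on-disk layout of the entries file as this test reads it
back, with entries sorted by ascending file size:

  index header (ENTRY_CODEC, VERSION_START..VERSION_CURRENT, segment id)
  vInt numEntries
  numEntries x (String id, long startOffset, long length)
  codec footer with checksum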