mirror of https://github.com/apache/lucene.git
Add ordering of files in compound files (#12241)
Today there is no specific ordering of the files written to a compound file. The current order is determined by iterating over the set of file names in SegmentInfo, which is undefined. This commit changes that to an ascending order by file size. Colocating data from files that are small (typically metadata files like the terms index, field infos etc.) but accessed often can help when parts of these files are held in cache.
This commit is contained in:
parent b0befef912
commit f45e096304
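As a standalone illustration of the idea (not part of the commit): a minimal Java sketch that orders plain filesystem files by ascending size, the same policy writeCompoundFile applies below via a priority queue. The SizeOrderExample class and the use of java.nio instead of Lucene's Directory API are this note's own assumptions.

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.List;

// Minimal sketch: sort file paths by ascending on-disk size, so small,
// frequently accessed files end up colocated at the front of an archive.
public class SizeOrderExample {
  static List<Path> orderBySize(List<Path> files) {
    return files.stream()
        .sorted(Comparator.comparingLong(SizeOrderExample::sizeOf))
        .toList();
  }

  private static long sizeOf(Path p) {
    try {
      return Files.size(p); // analogous to Directory.fileLength(name)
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }
}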
@@ -176,6 +176,8 @@ Optimizations
 
 * GITHUB#12198, GITHUB#12199: Reduced contention when indexing with many threads. (Adrien Grand)
 
+* GITHUB#12241: Add ordering of files in compound files. (Christoph Büscher)
+
 Bug Fixes
 ---------------------
@@ -27,6 +27,7 @@ import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.PriorityQueue;
 
 /**
  * Lucene 9.0 compound file format
@@ -102,11 +103,40 @@ public final class Lucene90CompoundFormat extends CompoundFormat {
     }
   }
 
+  private static class SizedFile {
+    private final String name;
+    private final long length;
+
+    private SizedFile(String name, long length) {
+      this.name = name;
+      this.length = length;
+    }
+  }
+
+  private static class SizedFileQueue extends PriorityQueue<SizedFile> {
+    SizedFileQueue(int maxSize) {
+      super(maxSize);
+    }
+
+    @Override
+    protected boolean lessThan(SizedFile sf1, SizedFile sf2) {
+      return sf1.length < sf2.length;
+    }
+  }
+
   private void writeCompoundFile(
       IndexOutput entries, IndexOutput data, Directory dir, SegmentInfo si) throws IOException {
     // write number of files
-    entries.writeVInt(si.files().size());
-    for (String file : si.files()) {
+    int numFiles = si.files().size();
+    entries.writeVInt(numFiles);
+    // first put files in ascending size order so small files are more likely to fit into one page
+    SizedFileQueue pq = new SizedFileQueue(numFiles);
+    for (String filename : si.files()) {
+      pq.add(new SizedFile(filename, dir.fileLength(filename)));
+    }
+    while (pq.size() > 0) {
+      SizedFile sizedFile = pq.pop();
+      String file = sizedFile.name;
       // align file start offset
       long startOffset = data.alignFilePointer(Long.BYTES);
       // write bytes for file
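A note on the queue semantics above: org.apache.lucene.util.PriorityQueue orders elements by the subclass's lessThan, so with sf1.length < sf2.length it behaves as a min-heap and pop() yields the shortest remaining file; the while loop therefore writes files in ascending size. A small sketch of that behavior; the file names and lengths here are made up for illustration, and SizedFile/SizedFileQueue are private classes in the real format, so this only compiles inside it:

// Illustration only: mirrors the private classes from the diff above.
SizedFileQueue pq = new SizedFileQueue(3);
pq.add(new SizedFile("_1.fdt", 900)); // large stored-fields data
pq.add(new SizedFile("_1.fnm", 100)); // small field-infos metadata
pq.add(new SizedFile("_1.tip", 300)); // terms index
assert pq.pop().name.equals("_1.fnm"); // smallest length comes out first
assert pq.pop().name.equals("_1.tip");
assert pq.pop().name.equals("_1.fdt"); // largest file is written last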
@@ -16,7 +16,17 @@
  */
 package org.apache.lucene.codecs.lucene90;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.tests.index.BaseCompoundFormatTestCase;
 import org.apache.lucene.tests.util.TestUtil;
@@ -27,4 +37,60 @@ public class TestLucene90CompoundFormat extends BaseCompoundFormatTestCase {
   protected Codec getCodec() {
     return codec;
   }
+
+  public void testFileLengthOrdering() throws IOException {
+    Directory dir = newDirectory();
+    // Setup the test segment
+    String segment = "_123";
+    int chunk = 1024; // internal buffer size used by the stream
+    SegmentInfo si = newSegmentInfo(dir, segment);
+    byte[] segId = si.getId();
+    List<String> orderedFiles = new ArrayList<>();
+    int randomFileSize = random().nextInt(0, chunk);
+    for (int i = 0; i < 10; i++) {
+      String filename = segment + "." + i;
+      createRandomFile(dir, filename, randomFileSize, segId);
+      // increase the next file's size by a random amount
+      randomFileSize += random().nextInt(1, 100);
+      orderedFiles.add(filename);
+    }
+    List<String> shuffledFiles = new ArrayList<>(orderedFiles);
+    Collections.shuffle(shuffledFiles, random());
+    si.setFiles(shuffledFiles);
+    si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT);
+
+    // entries file should contain files ordered by their size
+    String entriesFileName =
+        IndexFileNames.segmentFileName(si.name, "", Lucene90CompoundFormat.ENTRIES_EXTENSION);
+    try (ChecksumIndexInput entriesStream = dir.openChecksumInput(entriesFileName)) {
+      Throwable priorE = null;
+      try {
+        CodecUtil.checkIndexHeader(
+            entriesStream,
+            Lucene90CompoundFormat.ENTRY_CODEC,
+            Lucene90CompoundFormat.VERSION_START,
+            Lucene90CompoundFormat.VERSION_CURRENT,
+            si.getId(),
+            "");
+        final int numEntries = entriesStream.readVInt();
+        long lastOffset = 0;
+        long lastLength = 0;
+        for (int i = 0; i < numEntries; i++) {
+          final String id = entriesStream.readString();
+          assertEquals(orderedFiles.get(i), segment + id);
+          long offset = entriesStream.readLong();
+          assertTrue(offset > lastOffset);
+          lastOffset = offset;
+          long length = entriesStream.readLong();
+          assertTrue(length >= lastLength);
+          lastLength = length;
+        }
+      } catch (Throwable exception) {
+        priorE = exception;
+      } finally {
+        CodecUtil.checkFooter(entriesStream, priorE);
+      }
+    }
+    dir.close();
+  }
 }
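For context (not part of the commit): the new ordering only changes the on-disk layout of the .cfs data; readers keep looking files up by name through the compound reader, so no read-side change is needed. A sketch, assuming a Directory dir and a SegmentInfo si for a segment already written as a compound file, inside a method that throws IOException:

// Sketch: files are still addressed by name; their position inside the
// compound data is an invisible layout detail at this level.
try (CompoundDirectory cfs =
    si.getCodec().compoundFormat().getCompoundReader(dir, si, IOContext.DEFAULT)) {
  for (String file : cfs.listAll()) {
    System.out.println(file + " length=" + cfs.fileLength(file));
  }
}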