From f45e0963040f8a7d23be3751b07b490d480782e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Wed, 26 Apr 2023 15:01:02 +0200 Subject: [PATCH] Add ordering of files in compound files (#12241) Today there is no specific ordering of how files are written to a compound file. The current order is determined by iterating over the set of file names in SegmentInfo, which is undefined. This commit changes to an order based on file size. Colocating data from files that are smaller (typically metadata files like terms index, field info etc...) but accessed often can help when parts of these files are held in cache. --- lucene/CHANGES.txt | 2 + .../lucene90/Lucene90CompoundFormat.java | 34 +++++++++- .../lucene90/TestLucene90CompoundFormat.java | 66 +++++++++++++++++++ 3 files changed, 100 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 91f1ea039a7..103e714cb74 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -176,6 +176,8 @@ Optimizations * GITHUB#12198, GITHUB#12199: Reduced contention when indexing with many threads. (Adrien Grand) +* GITHUB#12241: Add ordering of files in compound files. 
(Christoph Büscher) + Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java index 797c2520a74..b486a6536cc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java @@ -27,6 +27,7 @@ import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.PriorityQueue; /** * Lucene 9.0 compound file format @@ -102,11 +103,40 @@ public final class Lucene90CompoundFormat extends CompoundFormat { } } + private static class SizedFile { + private final String name; + private final long length; + + private SizedFile(String name, long length) { + this.name = name; + this.length = length; + } + } + + private static class SizedFileQueue extends PriorityQueue<SizedFile> { + SizedFileQueue(int maxSize) { + super(maxSize); + } + + @Override + protected boolean lessThan(SizedFile sf1, SizedFile sf2) { + return sf1.length < sf2.length; + } + } + private void writeCompoundFile( IndexOutput entries, IndexOutput data, Directory dir, SegmentInfo si) throws IOException { // write number of files - entries.writeVInt(si.files().size()); - for (String file : si.files()) { + int numFiles = si.files().size(); + entries.writeVInt(numFiles); + // first put files in ascending size order so small files fit more likely into one page + SizedFileQueue pq = new SizedFileQueue(numFiles); + for (String filename : si.files()) { + pq.add(new SizedFile(filename, dir.fileLength(filename))); + } + while (pq.size() > 0) { + SizedFile sizedFile = pq.pop(); + String file = sizedFile.name; // align file start offset long startOffset = data.alignFilePointer(Long.BYTES); // write bytes for file diff --git
a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java index 506462f6d6b..8d6ee823c79 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90CompoundFormat.java @@ -16,7 +16,17 @@ */ package org.apache.lucene.codecs.lucene90; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.index.BaseCompoundFormatTestCase; import org.apache.lucene.tests.util.TestUtil; @@ -27,4 +37,60 @@ public class TestLucene90CompoundFormat extends BaseCompoundFormatTestCase { protected Codec getCodec() { return codec; } + + public void testFileLengthOrdering() throws IOException { + Directory dir = newDirectory(); + // Setup the test segment + String segment = "_123"; + int chunk = 1024; // internal buffer size used by the stream + SegmentInfo si = newSegmentInfo(dir, segment); + byte[] segId = si.getId(); + List<String> orderedFiles = new ArrayList<>(); + int randomFileSize = random().nextInt(0, chunk); + for (int i = 0; i < 10; i++) { + String filename = segment + "."
+ i; + createRandomFile(dir, filename, randomFileSize, segId); + // increase the next files size by a random amount + randomFileSize += random().nextInt(1, 100); + orderedFiles.add(filename); + } + List<String> shuffledFiles = new ArrayList<>(orderedFiles); + Collections.shuffle(shuffledFiles, random()); + si.setFiles(shuffledFiles); + si.getCodec().compoundFormat().write(dir, si, IOContext.DEFAULT); + + // entries file should contain files ordered by their size + String entriesFileName = + IndexFileNames.segmentFileName(si.name, "", Lucene90CompoundFormat.ENTRIES_EXTENSION); + try (ChecksumIndexInput entriesStream = dir.openChecksumInput(entriesFileName)) { + Throwable priorE = null; + try { + CodecUtil.checkIndexHeader( + entriesStream, + Lucene90CompoundFormat.ENTRY_CODEC, + Lucene90CompoundFormat.VERSION_START, + Lucene90CompoundFormat.VERSION_CURRENT, + si.getId(), + ""); + final int numEntries = entriesStream.readVInt(); + long lastOffset = 0; + long lastLength = 0; + for (int i = 0; i < numEntries; i++) { + final String id = entriesStream.readString(); + assertEquals(orderedFiles.get(i), segment + id); + long offset = entriesStream.readLong(); + assertTrue(offset > lastOffset); + lastOffset = offset; + long length = entriesStream.readLong(); + assertTrue(length >= lastLength); + lastLength = length; + } + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(entriesStream, priorE); + } + } + dir.close(); + } }