LUCENE-9507 Custom order for leaves in IndexReader and IndexWriter (#32)

1. Add an option to supply a custom leaf sorter for IndexWriter.
A DirectoryReader opened from this IndexWriter will have its leaf
readers sorted with the provided leaf sorter. This is useful for
indices on which it is expected to run many queries with particular
sort criteria (e.g. for time-based indices this is usually a
descending sort on timestamp). Providing a leafSorter makes it
possible to speed up early termination for this particular type
of sorted query.

2. Add an option to supply a custom sub-readers sorter for
BaseCompositeReader. In this case sub-readers will be sorted 
according to the provided leafSorter.

3. Add an option to supply a custom leaf sorter for
StandardDirectoryReader. The leaf readers of this
StandardDirectoryReader will be sorted according to
the provided leaf sorter.
This commit is contained in:
Mayya Sharipova 2021-03-26 09:56:02 -04:00 committed by GitHub
parent b174ef45c4
commit 48715fe898
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 268 additions and 24 deletions

View File

@ -16,6 +16,9 @@ New Features
* LUCENE-9659: SpanPayloadCheckQuery now supports inequalities. (Kevin Watters, Gus Heck)
* LUCENE-9507: Custom order for leaves in IndexReader and IndexWriter
(Mayya Sharipova, Mike McCandless, Jim Ferenczi)
System Requirements
* LUCENE-8738: Move to Java 11 as minimum Java version.

View File

@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
@ -46,6 +47,9 @@ import java.util.concurrent.atomic.AtomicInteger;
*/
public abstract class BaseCompositeReader<R extends IndexReader> extends CompositeReader {
private final R[] subReaders;
/** A comparator for sorting sub-readers */
protected final Comparator<R> subReadersSorter;
private final int[] starts; // 1st docno for each reader
private final int maxDoc;
private AtomicInteger numDocs = new AtomicInteger(-1); // computed lazily
@ -63,9 +67,15 @@ public abstract class BaseCompositeReader<R extends IndexReader> extends Composi
* #getSequentialSubReaders} and used to resolve the correct subreader for docID-based
* methods. <b>Please note:</b> This array is <b>not</b> cloned and not protected for
* modification, the subclass is responsible to do this.
* @param subReadersSorter a comparator for sorting sub readers. If not {@code null}, this
* comparator is used to sort sub readers, before using them for resolving doc IDs.
*/
protected BaseCompositeReader(R[] subReaders) throws IOException {
protected BaseCompositeReader(R[] subReaders, Comparator<R> subReadersSorter) throws IOException {
if (subReadersSorter != null) {
Arrays.sort(subReaders, subReadersSorter);
}
this.subReaders = subReaders;
this.subReadersSorter = subReadersSorter;
this.subReadersList = Collections.unmodifiableList(Arrays.asList(subReaders));
starts = new int[subReaders.length + 1]; // build starts array
long maxDoc = 0;

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.nio.file.NoSuchFileException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.search.SearcherManager; // javadocs
import org.apache.lucene.store.Directory;
@ -56,7 +57,24 @@ public abstract class DirectoryReader extends BaseCompositeReader<LeafReader> {
* @throws IOException if there is a low-level IO error
*/
public static DirectoryReader open(final Directory directory) throws IOException {
return StandardDirectoryReader.open(directory, null);
return StandardDirectoryReader.open(directory, null, null);
}
/**
* Returns an IndexReader for the index in the given Directory, with its leaf readers ordered by
* the given comparator.
*
* @param directory the index directory
* @param leafSorter a comparator for sorting leaf readers; may be {@code null}, in which case the
* leaf readers are not re-sorted. Providing leafSorter is useful for indices on which it is
* expected to run many queries with particular sort criteria (e.g. for time-based indices
* this is usually a descending sort on timestamp). In this case {@code leafSorter} should
* sort leaves according to these sort criteria. Providing leafSorter makes it possible to
* speed up this particular type of sorted query by early terminating while iterating through
* segments and segments' documents.
* @return a reader over the index in {@code directory}
* @throws IOException if there is a low-level IO error
*/
public static DirectoryReader open(final Directory directory, Comparator<LeafReader> leafSorter)
throws IOException {
return StandardDirectoryReader.open(directory, null, leafSorter);
}
/**
@ -101,7 +119,7 @@ public abstract class DirectoryReader extends BaseCompositeReader<LeafReader> {
* @throws IOException if there is a low-level IO error
*/
public static DirectoryReader open(final IndexCommit commit) throws IOException {
return StandardDirectoryReader.open(commit.getDirectory(), commit);
return StandardDirectoryReader.open(commit.getDirectory(), commit, null);
}
/**
@ -118,7 +136,8 @@ public abstract class DirectoryReader extends BaseCompositeReader<LeafReader> {
*/
public static DirectoryReader open(final IndexCommit commit, int minSupportedMajorVersion)
throws IOException {
return StandardDirectoryReader.open(commit.getDirectory(), minSupportedMajorVersion, commit);
return StandardDirectoryReader.open(
commit.getDirectory(), minSupportedMajorVersion, commit, null);
}
/**
@ -317,9 +336,13 @@ public abstract class DirectoryReader extends BaseCompositeReader<LeafReader> {
* methods. <b>Please note:</b> This array is <b>not</b> cloned and not protected for
* modification outside of this reader. Subclasses of {@code DirectoryReader} should take care
* to not allow modification of this internal array, e.g. {@link #doOpenIfChanged()}.
* @param leafSorter a comparator for sorting leaf readers. If not {@code null}, this comparator
* is used for sorting leaf readers.
*/
protected DirectoryReader(Directory directory, LeafReader[] segmentReaders) throws IOException {
super(segmentReaders);
protected DirectoryReader(
Directory directory, LeafReader[] segmentReaders, Comparator<LeafReader> leafSorter)
throws IOException {
super(segmentReaders, leafSorter);
this.directory = directory;
}

View File

@ -88,7 +88,7 @@ public abstract class FilterDirectoryReader extends DirectoryReader {
* @param wrapper the SubReaderWrapper to use to wrap subreaders
*/
public FilterDirectoryReader(DirectoryReader in, SubReaderWrapper wrapper) throws IOException {
super(in.directory(), wrapper.wrap(in.getSequentialSubReaders()));
super(in.directory(), wrapper.wrap(in.getSequentialSubReaders()), null);
this.in = in;
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.index;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@ -478,6 +479,18 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
return this;
}
/**
* Set the comparator for sorting leaf readers. A DirectoryReader opened from an IndexWriter with
* this configuration will have its leaf readers sorted with the provided leaf sorter.
*
* @param leafSorter a comparator for sorting leaf readers
* @return this {@code IndexWriterConfig}, with leafSorter set, so that calls can be chained
*/
public IndexWriterConfig setLeafSorter(Comparator<LeafReader> leafSorter) {
this.leafSorter = leafSorter;
return this;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder(super.toString());

View File

@ -17,6 +17,7 @@
package org.apache.lucene.index;
import java.util.Collections;
import java.util.Comparator;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
@ -91,6 +92,9 @@ public class LiveIndexWriterConfig {
/** The sort order to use to write merged segments. */
protected Sort indexSort = null;
/** The comparator for sorting leaf readers. */
protected Comparator<LeafReader> leafSorter;
/** The field names involved in the index sort */
protected Set<String> indexSortFields = Collections.emptySet();
@ -393,6 +397,17 @@ public class LiveIndexWriterConfig {
return indexSortFields;
}
/**
* Returns a comparator for sorting leaf readers. If not {@code null}, this comparator is used to
* sort leaf readers within {@code DirectoryReader} opened from the {@code IndexWriter} of this
* configuration.
*
* @return the configured comparator for sorting leaf readers; may be {@code null} when no leaf
* sorter has been set
*/
public Comparator<LeafReader> getLeafSorter() {
return leafSorter;
}
/**
* Expert: Returns if indexing threads check for pending flushes on update in order to help our
* flushing indexing buffers to disk
@ -467,6 +482,7 @@ public class LiveIndexWriterConfig {
sb.append("checkPendingFlushOnUpdate=").append(isCheckPendingFlushOnUpdate()).append("\n");
sb.append("softDeletesField=").append(getSoftDeletesField()).append("\n");
sb.append("maxFullFlushMergeWaitMillis=").append(getMaxFullFlushMergeWaitMillis()).append("\n");
sb.append("leafSorter=").append(getLeafSorter()).append("\n");
sb.append("eventListener=").append(getIndexWriterEventListener()).append("\n");
return sb.toString();
}

View File

@ -17,6 +17,7 @@
package org.apache.lucene.index;
import java.io.IOException;
import java.util.Comparator;
/**
* A {@link CompositeReader} which reads multiple indexes, appending their content. It can be used
@ -46,7 +47,7 @@ public class MultiReader extends BaseCompositeReader<IndexReader> {
* @param subReaders set of (sub)readers
*/
public MultiReader(IndexReader... subReaders) throws IOException {
this(subReaders, true);
this(subReaders, null, true);
}
/**
@ -57,7 +58,22 @@ public class MultiReader extends BaseCompositeReader<IndexReader> {
* is closed
*/
public MultiReader(IndexReader[] subReaders, boolean closeSubReaders) throws IOException {
super(subReaders.clone());
this(subReaders, null, closeSubReaders);
}
/**
* Construct a MultiReader aggregating the named set of (sub)readers.
*
* @param subReaders set of (sub)readers; this array will be cloned.
* @param subReadersSorter a comparator, that if not {@code null} is used for sorting sub
* readers.
* @param closeSubReaders indicates whether the subreaders should be closed when this MultiReader
* is closed
*/
public MultiReader(
IndexReader[] subReaders, Comparator<IndexReader> subReadersSorter, boolean closeSubReaders)
throws IOException {
super(subReaders.clone(), subReadersSorter);
this.closeSubReaders = closeSubReaders;
if (!closeSubReaders) {
for (int i = 0; i < subReaders.length; i++) {

View File

@ -69,7 +69,7 @@ public class ParallelCompositeReader extends BaseCompositeReader<LeafReader> {
public ParallelCompositeReader(
boolean closeSubReaders, CompositeReader[] readers, CompositeReader[] storedFieldReaders)
throws IOException {
super(prepareLeafReaders(readers, storedFieldReaders));
super(prepareLeafReaders(readers, storedFieldReaders), null);
this.closeSubReaders = closeSubReaders;
Collections.addAll(completeReaderSet, readers);
Collections.addAll(completeReaderSet, storedFieldReaders);

View File

@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -42,30 +43,35 @@ public final class StandardDirectoryReader extends DirectoryReader {
private final boolean applyAllDeletes;
private final boolean writeAllDeletes;
/** called only from static open() methods */
/** package private constructor, called only from static open() methods. */
StandardDirectoryReader(
Directory directory,
LeafReader[] readers,
IndexWriter writer,
SegmentInfos sis,
Comparator<LeafReader> leafSorter,
boolean applyAllDeletes,
boolean writeAllDeletes)
throws IOException {
super(directory, readers);
super(directory, readers, leafSorter);
this.writer = writer;
this.segmentInfos = sis;
this.applyAllDeletes = applyAllDeletes;
this.writeAllDeletes = writeAllDeletes;
}
static DirectoryReader open(final Directory directory, final IndexCommit commit)
static DirectoryReader open(
final Directory directory, final IndexCommit commit, Comparator<LeafReader> leafSorter)
throws IOException {
return open(directory, Version.MIN_SUPPORTED_MAJOR, commit);
return open(directory, Version.MIN_SUPPORTED_MAJOR, commit, leafSorter);
}
/** called from DirectoryReader.open(...) methods */
static DirectoryReader open(
final Directory directory, int minSupportedMajorVersion, final IndexCommit commit)
final Directory directory,
int minSupportedMajorVersion,
final IndexCommit commit,
Comparator<LeafReader> leafSorter)
throws IOException {
return new SegmentInfos.FindSegmentsFile<DirectoryReader>(directory) {
@Override
@ -86,11 +92,10 @@ public final class StandardDirectoryReader extends DirectoryReader {
readers[i] =
new SegmentReader(sis.info(i), sis.getIndexCreatedVersionMajor(), IOContext.READ);
}
// This may throw CorruptIndexException if there are too many docs, so
// it must be inside try clause so we close readers in that case:
DirectoryReader reader =
new StandardDirectoryReader(directory, readers, null, sis, false, false);
new StandardDirectoryReader(directory, readers, null, sis, leafSorter, false, false);
success = true;
return reader;
@ -149,6 +154,7 @@ public final class StandardDirectoryReader extends DirectoryReader {
readers.toArray(new SegmentReader[readers.size()]),
writer,
segmentInfos,
writer.getConfig().getLeafSorter(),
applyAllDeletes,
writeAllDeletes);
return result;
@ -169,7 +175,10 @@ public final class StandardDirectoryReader extends DirectoryReader {
* @lucene.internal
*/
public static DirectoryReader open(
Directory directory, SegmentInfos infos, List<? extends LeafReader> oldReaders)
Directory directory,
SegmentInfos infos,
List<? extends LeafReader> oldReaders,
Comparator<LeafReader> leafSorter)
throws IOException {
// we put the old SegmentReaders in a map, that allows us
@ -291,7 +300,8 @@ public final class StandardDirectoryReader extends DirectoryReader {
}
}
}
return new StandardDirectoryReader(directory, newReaders, null, infos, false, false);
return new StandardDirectoryReader(
directory, newReaders, null, infos, leafSorter, false, false);
}
// TODO: move somewhere shared if it's useful elsewhere
@ -406,7 +416,8 @@ public final class StandardDirectoryReader extends DirectoryReader {
}
DirectoryReader doOpenIfChanged(SegmentInfos infos) throws IOException {
return StandardDirectoryReader.open(directory, infos, getSequentialSubReaders());
return StandardDirectoryReader.open(
directory, infos, getSequentialSubReaders(), subReadersSorter);
}
@Override

View File

@ -16,10 +16,15 @@
*/
package org.apache.lucene.index;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomLongBetween;
import static java.util.stream.Collectors.toList;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
@ -27,6 +32,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@ -1199,4 +1205,150 @@ public class TestIndexWriterReader extends LuceneTestCase {
w.close();
dir.close();
}
/**
 * Verifies that leaf readers are exposed in the order defined by a custom leaf sorter, both on the
 * initial open and after {@link DirectoryReader#openIfChanged(DirectoryReader)}, for three ways
 * of obtaining a reader: from the {@code IndexWriter} (sorter taken from {@code
 * IndexWriterConfig}), from {@code DirectoryReader.open(dir, leafSorter)}, and from a {@code
 * FilterDirectoryReader} wrapping the latter.
 */
public void testIndexReaderWriterWithLeafSorter() throws IOException {
  final String fieldName = "field1";
  final boolean ascSort = randomBoolean();
  // Sentinel used for leaves without points, chosen so that such leaves sort last in both the
  // ascending and the (reversed) descending order.
  final long missingValue = ascSort ? Long.MAX_VALUE : Long.MIN_VALUE;
  // Lies outside the [1, 99] range indexed below, so segments holding it must sort first.
  final long firstValue = ascSort ? 0 : 100;

  // Sort leaf readers by the min value (asc sort) or max value (desc sort) of their points.
  Comparator<LeafReader> leafSorter =
      Comparator.comparingLong(r -> leafSortValue(r, fieldName, ascSort, missingValue));
  if (ascSort == false) {
    leafSorter = leafSorter.reversed();
  }

  final int numDocs = atLeast(30);
  IndexWriterConfig iwc = new IndexWriterConfig();
  iwc.setLeafSorter(leafSorter);
  // try-with-resources closes the writer first and then the directory, even when an assertion
  // fails part-way through the test.
  try (Directory dir = newDirectory();
      IndexWriter writer = new IndexWriter(dir, iwc)) {
    for (int i = 0; i < numDocs; ++i) {
      final Document doc = new Document();
      doc.add(new LongPoint(fieldName, randomLongBetween(1, 99)));
      writer.addDocument(doc);
      if (i > 0 && i % 10 == 0) {
        writer.flush(); // force several segments so that there is something to sort
      }
    }

    // Test1: leaf readers are sorted according to leafSorter provided in IndexWriterConfig
    try (DirectoryReader reader = writer.getReader()) {
      assertLeavesSortedAcrossReopen(reader, writer, leafSorter, fieldName, firstValue);
    }

    // Test2: leaf readers are sorted according to leafSorter provided in DirectoryReader
    try (DirectoryReader reader = DirectoryReader.open(dir, leafSorter)) {
      assertLeavesSortedAcrossReopen(reader, writer, leafSorter, fieldName, firstValue);
    }

    // Test3: FilterDirectoryReader sorts leaves according to leafSorter of its wrapped reader
    try (DirectoryReader reader =
        new AssertingDirectoryReader(DirectoryReader.open(dir, leafSorter))) {
      assertLeavesSortedAcrossReopen(reader, writer, leafSorter, fieldName, firstValue);
    }
  }
}

/**
 * Returns the sort key of a leaf: the min (asc) or max (desc) point value of {@code field}, or
 * {@code missingValue} when the leaf has no points for that field. An {@code IOException} fails
 * the test instead of being silently mapped to {@code missingValue}.
 */
private static long leafSortValue(
    LeafReader reader, String field, boolean ascSort, long missingValue) {
  try {
    PointValues points = reader.getPointValues(field);
    if (points == null) {
      return missingValue;
    }
    byte[] packedValue = ascSort ? points.getMinPackedValue() : points.getMaxPackedValue();
    return LongPoint.decodeDimension(packedValue, 0);
  } catch (IOException e) {
    throw new AssertionError("failed to read point values for field [" + field + "]", e);
  }
}

/**
 * Asserts that {@code reader} exposes sorted leaves, then indexes ten documents that should sort
 * first, commits, reopens the reader and asserts that the reopened reader is sorted as well.
 */
private static void assertLeavesSortedAcrossReopen(
    DirectoryReader reader,
    IndexWriter writer,
    Comparator<LeafReader> leafSorter,
    String field,
    long firstValue)
    throws IOException {
  assertLeavesSorted(reader, leafSorter);

  // add more documents that should be sorted first
  for (int i = 0; i < 10; ++i) {
    final Document doc = new Document();
    doc.add(new LongPoint(field, firstValue));
    writer.addDocument(doc);
  }
  writer.commit();

  // and open again
  try (DirectoryReader reopened = DirectoryReader.openIfChanged(reader)) {
    assertNotNull("expected a new reader after committing additional documents", reopened);
    assertLeavesSorted(reopened, leafSorter);
  }
}

/** Asserts that the leaves of {@code reader} are already in the order {@code leafSorter} gives. */
private static void assertLeavesSorted(
    DirectoryReader reader, Comparator<LeafReader> leafSorter) {
  List<LeafReader> actual =
      reader.leaves().stream().map(LeafReaderContext::reader).collect(toList());
  List<LeafReader> expected = actual.stream().sorted(leafSorter).collect(toList());
  assertEquals(expected, actual);
}
}

View File

@ -1116,7 +1116,7 @@ public class UnifiedHighlighter {
.map(LeafReaderContext::reader)
.map(TermVectorReusingLeafReader::new)
.toArray(LeafReader[]::new);
return new BaseCompositeReader<IndexReader>(leafReaders) {
return new BaseCompositeReader<IndexReader>(leafReaders, null) {
@Override
protected void doClose() { // don't close the underlying reader
}

View File

@ -187,7 +187,7 @@ public class MultiPassIndexSplitter {
extends BaseCompositeReader<FakeDeleteLeafIndexReader> {
public FakeDeleteIndexReader(IndexReader reader) throws IOException {
super(initSubReaders(reader));
super(initSubReaders(reader), null);
}
private static FakeDeleteLeafIndexReader[] initSubReaders(IndexReader reader)

View File

@ -60,7 +60,7 @@ class SegmentInfosSearcherManager extends ReferenceManager<IndexSearcher> {
node.message("SegmentInfosSearcherManager.init: use incoming infos=" + infosIn.toString());
current =
SearcherManager.getSearcher(
searcherFactory, StandardDirectoryReader.open(dir, currentInfos, null), null);
searcherFactory, StandardDirectoryReader.open(dir, currentInfos, null, null), null);
addReaderClosedListener(current.getIndexReader());
}
@ -111,7 +111,7 @@ class SegmentInfosSearcherManager extends ReferenceManager<IndexSearcher> {
}
// Open a new reader, sharing any common segment readers with the old one:
DirectoryReader r = StandardDirectoryReader.open(dir, currentInfos, subs);
DirectoryReader r = StandardDirectoryReader.open(dir, currentInfos, subs, null);
addReaderClosedListener(r);
node.message("refreshed to version=" + currentInfos.getVersion() + " r=" + r);
return SearcherManager.getSearcher(searcherFactory, r, old.getIndexReader());