LUCENE-10106: Sort optimization wrongly skips first docs (#300)

The first documents of subsequent segments are mistakenly skipped when 
sort optimization is enabled. We should initialize maxDocVisited in
NumericComparator to -1 instead of 0.
This commit is contained in:
Nhat Nguyen 2021-09-15 09:21:59 -04:00 committed by GitHub
parent 1586933b18
commit b7a286dd69
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 85 additions and 1 deletions

View File

@ -84,7 +84,7 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
private DocIdSetIterator competitiveIterator;
private long iteratorCost;
private int maxDocVisited = 0;
private int maxDocVisited = -1;
private int updateCounter = 0;
public NumericLeafComparator(LeafReaderContext context) throws IOException {

View File

@ -20,6 +20,10 @@ import static org.apache.lucene.search.SortField.FIELD_DOC;
import static org.apache.lucene.search.SortField.FIELD_SCORE;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatDocValuesField;
@ -633,4 +637,84 @@ public class TestSortOptimization extends LuceneTestCase {
reader.close();
dir.close();
}
/**
 * Indexes increasing values into {@code my_field}, flushes once, and then adds a document with
 * the smallest value as the first document written after that flush. A search sorted on
 * {@code my_field} must return that document as its first hit.
 *
 * @throws IOException if indexing or searching fails
 */
public void testMaxDocVisited() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
  int numDocs = atLeast(10000);
  // offset is at least 100 and smallestValue is at most 99, so every regular document
  // (value i + offset) is strictly greater than smallestValue.
  long offset = 100 + random().nextInt(100);
  long smallestValue = 50 + random().nextInt(50);
  boolean flushed = false;
  for (int i = 0; i < numDocs; ++i) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("my_field", i + offset));
    doc.add(new LongPoint("my_field", i + offset));
    writer.addDocument(doc);
    // Flush exactly once, the first time i reaches 5000.
    if (i >= 5000 && flushed == false) {
      flushed = true;
      writer.flush();
      // Index the smallest value to the first slot of the second segment
      doc = new Document();
      doc.add(new NumericDocValuesField("my_field", smallestValue));
      doc.add(new LongPoint("my_field", smallestValue));
      writer.addDocument(doc);
    }
  }
  IndexReader reader = DirectoryReader.open(writer);
  writer.close();
  IndexSearcher searcher = new IndexSearcher(reader);
  SortField sortField = new SortField("my_field", SortField.Type.LONG);
  // Request between 1 and 100 hits; only the first one is checked.
  TopFieldDocs topDocs =
      searcher.search(new MatchAllDocsQuery(), 1 + random().nextInt(100), new Sort(sortField));
  FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[0];
  // Compare as long: intValue() would narrow the sort value before the comparison.
  assertEquals(smallestValue, ((Long) fieldDoc.fields[0]).longValue());
  reader.close();
  dir.close();
}
/**
 * Indexes the values {@code 0..N-1} into {@code seq_no} in shuffled order, flushing at random
 * points, and then pages through all documents with {@code searchAfter} sorted on
 * {@code seq_no}. Every page must have the expected size, and the hits across all pages must
 * match the sorted list of indexed values.
 *
 * @throws IOException if indexing or searching fails
 */
public void testRandomLong() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
  List<Long> seqNos = LongStream.range(0, atLeast(10_000)).boxed().collect(Collectors.toList());
  Collections.shuffle(seqNos, random());
  int pendingDocs = 0;
  for (long seqNo : seqNos) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("seq_no", seqNo));
    doc.add(new LongPoint("seq_no", seqNo));
    writer.addDocument(doc);
    pendingDocs++;
    // Once more than 500 docs are pending, flush with a small random probability per document.
    if (pendingDocs > 500 && random().nextInt(100) <= 5) {
      pendingDocs = 0;
      writer.flush();
    }
  }
  writer.flush();
  // From here on seqNos is the expected result order.
  seqNos.sort(Long::compare);
  IndexReader reader = DirectoryReader.open(writer);
  writer.close();
  IndexSearcher searcher = new IndexSearcher(reader);
  SortField sortField = new SortField("seq_no", SortField.Type.LONG);
  int visitedHits = 0;
  ScoreDoc after = null;
  while (visitedHits < seqNos.size()) {
    // Page size between 1 and 100, re-drawn for every page.
    int batch = 1 + random().nextInt(100);
    // Randomly alternate between a match-all query and a range query over [0, Long.MAX_VALUE].
    Query query =
        random().nextBoolean()
            ? new MatchAllDocsQuery()
            : LongPoint.newRangeQuery("seq_no", 0, Long.MAX_VALUE);
    TopDocs topDocs = searcher.searchAfter(after, query, batch, new Sort(sortField));
    // The last page may be shorter than the requested batch.
    int expectedHits = Math.min(seqNos.size() - visitedHits, batch);
    assertEquals(expectedHits, topDocs.scoreDocs.length);
    // The last hit of this page is the cursor for the next one.
    after = topDocs.scoreDocs[expectedHits - 1];
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
      FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[i];
      long expectedSeqNo = seqNos.get(visitedHits);
      // Compare as long: intValue() would narrow the sort value before the comparison.
      assertEquals(expectedSeqNo, ((Long) fieldDoc.fields[0]).longValue());
      visitedHits++;
    }
  }
  reader.close();
  dir.close();
}
}