LUCENE-10106: Sort optimization wrongly skips first docs (#300)

The first documents of subsequent segments are mistakenly skipped when 
sort optimization is enabled. We should initialize maxDocVisited in
NumericComparator to -1 instead of 0.
Nhat Nguyen 2021-09-15 09:21:59 -04:00 committed by GitHub
parent 1586933b18
commit b7a286dd69
2 changed files with 85 additions and 1 deletion
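Why the initial value matters: during collection the leaf comparator records the last doc ID it has collected in maxDocVisited, and when it rebuilds its competitive iterator it treats every doc ID at or below that value as already visited. With an initial value of 0, doc 0 of a freshly opened segment therefore looks visited before anything has been collected in that segment and can be skipped. The snippet below is a minimal standalone sketch of that check, not Lucene's actual code; isAlreadyVisited is a hypothetical stand-in for the real skip condition.

// Minimal sketch, NOT Lucene's implementation: isAlreadyVisited is a hypothetical
// stand-in for the per-leaf check that excludes already-collected documents from
// the competitive iterator used by the sort optimization.
class MaxDocVisitedSketch {

  // Docs at or below maxDocVisited are assumed to have been collected already.
  static boolean isAlreadyVisited(int docId, int maxDocVisited) {
    return docId <= maxDocVisited;
  }

  public static void main(String[] args) {
    // Old initial value: doc 0 of a fresh segment looks "visited" and is skipped.
    System.out.println(isAlreadyVisited(0, 0));  // true  -> first doc wrongly skipped
    // Fixed initial value: no doc counts as visited until one is actually collected.
    System.out.println(isAlreadyVisited(0, -1)); // false -> first doc is kept
  }
}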

@@ -84,7 +84,7 @@ public abstract class NumericComparator<T extends Number> extends FieldComparator<T>
     private DocIdSetIterator competitiveIterator;
     private long iteratorCost;
-    private int maxDocVisited = 0;
+    private int maxDocVisited = -1;
     private int updateCounter = 0;

     public NumericLeafComparator(LeafReaderContext context) throws IOException {

@@ -20,6 +20,10 @@ import static org.apache.lucene.search.SortField.FIELD_DOC;
 import static org.apache.lucene.search.SortField.FIELD_SCORE;
 import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FloatDocValuesField;
@@ -633,4 +637,84 @@ public class TestSortOptimization extends LuceneTestCase {
     reader.close();
     dir.close();
   }
+
+  public void testMaxDocVisited() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
+    int numDocs = atLeast(10000);
+    long offset = 100 + random().nextInt(100);
+    long smallestValue = 50 + random().nextInt(50);
+    boolean flushed = false;
+    for (int i = 0; i < numDocs; ++i) {
+      Document doc = new Document();
+      doc.add(new NumericDocValuesField("my_field", i + offset));
+      doc.add(new LongPoint("my_field", i + offset));
+      writer.addDocument(doc);
+      if (i >= 5000 && flushed == false) {
+        flushed = true;
+        writer.flush();
+        // Index the smallest value to the first slot of the second segment
+        doc = new Document();
+        doc.add(new NumericDocValuesField("my_field", smallestValue));
+        doc.add(new LongPoint("my_field", smallestValue));
+        writer.addDocument(doc);
+      }
+    }
+    IndexReader reader = DirectoryReader.open(writer);
+    writer.close();
+    IndexSearcher searcher = new IndexSearcher(reader);
+    SortField sortField = new SortField("my_field", SortField.Type.LONG);
+    TopFieldDocs topDocs =
+        searcher.search(new MatchAllDocsQuery(), 1 + random().nextInt(100), new Sort(sortField));
+    FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[0];
+    assertEquals(smallestValue, ((Long) fieldDoc.fields[0]).intValue());
+    reader.close();
+    dir.close();
+  }
+
+  public void testRandomLong() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
+    List<Long> seqNos = LongStream.range(0, atLeast(10_000)).boxed().collect(Collectors.toList());
+    Collections.shuffle(seqNos, random());
+    int pendingDocs = 0;
+    for (long seqNo : seqNos) {
+      Document doc = new Document();
+      doc.add(new NumericDocValuesField("seq_no", seqNo));
+      doc.add(new LongPoint("seq_no", seqNo));
+      writer.addDocument(doc);
+      pendingDocs++;
+      if (pendingDocs > 500 && random().nextInt(100) <= 5) {
+        pendingDocs = 0;
+        writer.flush();
+      }
+    }
+    writer.flush();
+    seqNos.sort(Long::compare);
+    IndexReader reader = DirectoryReader.open(writer);
+    writer.close();
+    IndexSearcher searcher = new IndexSearcher(reader);
+    SortField sortField = new SortField("seq_no", SortField.Type.LONG);
+    int visitedHits = 0;
+    ScoreDoc after = null;
+    while (visitedHits < seqNos.size()) {
+      int batch = 1 + random().nextInt(100);
+      Query query =
+          random().nextBoolean()
+              ? new MatchAllDocsQuery()
+              : LongPoint.newRangeQuery("seq_no", 0, Long.MAX_VALUE);
+      TopDocs topDocs = searcher.searchAfter(after, query, batch, new Sort(sortField));
+      int expectedHits = Math.min(seqNos.size() - visitedHits, batch);
+      assertEquals(expectedHits, topDocs.scoreDocs.length);
+      after = topDocs.scoreDocs[expectedHits - 1];
+      for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+        FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[i];
+        long expectedSeqNo = seqNos.get(visitedHits);
+        assertEquals(expectedSeqNo, ((Long) fieldDoc.fields[0]).intValue());
+        visitedHits++;
+      }
+    }
+    reader.close();
+    dir.close();
+  }
 }