LUCENE-10020 DocComparator don't skip docs of same docID (#204)

DocComparator should not skip documents with the same docID when sorting
on multiple fields with search after.

Because of the optimization introduced in LUCENE-9449, when searching with
a sort on [_doc, other fields] combined with search after, DocComparator
can efficiently skip all docs up to and including the provided search-after
docID. This is desirable behaviour in a single-index search. But in a
distributed search, where multiple indices contain docs with the same docID,
a sort on [_doc, other fields] must NOT skip documents whose docID equals
the search-after docID: their remaining sort fields can still place them
after the search-after entry.

This PR fixes this.

Relates to LUCENE-9449
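
To make the failure mode concrete, here is a small self-contained sketch
(hypothetical docIDs and field values, not part of the patch) of which docs the
old and new per-index lower bounds keep competitive:

import java.util.Arrays;
import java.util.stream.IntStream;

// Hypothetical illustration of why the search-after docID must stay competitive
// when several indices share docIDs. Sort is [_doc, my_field]; the previous page
// ended at (doc=4, my_field=12), a hit that came from index 0. Index 1 also
// contains docID 4, whose my_field value is 13, so it sorts AFTER the
// search-after entry and must show up on the next page.
public class SearchAfterDocIdExample {
  public static void main(String[] args) {
    int afterDoc = 4;               // docID of the search-after entry
    int[] index1DocIds = {3, 4, 5}; // docIDs present in index 1

    // Old per-index bound (minDoc = afterDoc + 1): docID 4 is skipped in every
    // index, so index 1's (doc=4, my_field=13) is silently dropped.
    int[] oldCompetitive = IntStream.of(index1DocIds).filter(d -> d > afterDoc).toArray();

    // Fixed bound (minDoc = afterDoc): docID 4 stays competitive; the tie on the
    // docID is then broken by the remaining sort field (my_field, 13 > 12).
    int[] newCompetitive = IntStream.of(index1DocIds).filter(d -> d >= afterDoc).toArray();

    System.out.println("old: " + Arrays.toString(oldCompetitive)); // [5]
    System.out.println("new: " + Arrays.toString(newCompetitive)); // [4, 5]
  }
}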
Mayya Sharipova 2021-07-06 14:59:57 -04:00 committed by GitHub
parent 167bd99c23
commit 64d9f8c587
3 changed files with 72 additions and 2 deletions

lucene/CHANGES.txt

@@ -383,6 +383,9 @@ Bug Fixes

* LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields
  (Gautam Worah)

* LUCENE-10020: DocComparator should not skip docs with the same docID on
multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)

Other
---------------------
(No changes)

lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java

@@ -81,7 +81,11 @@ public class DocComparator extends FieldComparator<Integer> {
     public DocLeafComparator(LeafReaderContext context) {
       this.docBase = context.docBase;
       if (enableSkipping) {
-        this.minDoc = topValue + 1;
+        // Skip docs before topValue, but include docs starting with topValue.
+        // Including topValue is necessary when doing sort on [_doc, other fields]
+        // in a distributed search where there are docs from different indices
+        // with the same docID.
+        this.minDoc = topValue;
         this.maxDoc = context.reader().maxDoc();
         this.competitiveIterator = DocIdSetIterator.all(maxDoc);
       } else {
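
For context on how this bound is used, below is a heavily simplified, hypothetical
sketch (assumed class and method names, not the actual DocComparator code) of a
doc-ID leaf comparator whose competitive iterator starts at an inclusive minDoc
once enough hits have been collected:

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;

// Simplified, hypothetical sketch of the skipping idea: once enough hits have been
// collected, only docs in [minDoc, maxDoc) can still compete. minDoc is now the
// search-after docID itself (inclusive), so equal docIDs coming from other indices
// stay competitive and the tie is broken by the remaining sort fields.
class DocIdSkippingSketch {
  private final int minDoc; // inclusive lower bound (the search-after docID)
  private final int maxDoc; // exclusive upper bound (the segment's maxDoc)
  private DocIdSetIterator competitiveIterator;

  DocIdSkippingSketch(int afterDocId, int maxDoc) {
    this.minDoc = afterDocId; // before this change: afterDocId + 1
    this.maxDoc = maxDoc;
    this.competitiveIterator = DocIdSetIterator.all(maxDoc);
  }

  // Called once the collector no longer needs every hit: restrict the iterator
  // to the doc-ID range that can still enter the top hits.
  void onHitsThresholdReached() {
    competitiveIterator =
        new DocIdSetIterator() {
          private int doc = -1;

          @Override
          public int docID() {
            return doc;
          }

          @Override
          public int nextDoc() throws IOException {
            return advance(doc + 1);
          }

          @Override
          public int advance(int target) {
            doc = Math.max(target, minDoc);
            if (doc >= maxDoc) {
              doc = NO_MORE_DOCS;
            }
            return doc;
          }

          @Override
          public long cost() {
            return (long) maxDoc - minDoc;
          }
        };
  }

  DocIdSetIterator getCompetitiveIterator() {
    return competitiveIterator;
  }
}

The real DocLeafComparator additionally guards this behind the enableSkipping flag
and the collector's total-hits threshold; the sketch only illustrates the inclusive
lower bound.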

lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java

@@ -37,7 +37,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;

-public class TestFieldSortOptimizationSkipping extends LuceneTestCase {
+public class TestSortOptimization extends LuceneTestCase {

   public void testLongSortOptimization() throws IOException {
@@ -332,6 +332,69 @@ public class TestFieldSortOptimizationSkipping extends LuceneTestCase {
    dir.close();
  }

  /**
   * Test that a search with sort on [_doc, other fields] across multiple indices doesn't miss any
   * documents.
   */
  public void testDocSortOptimizationMultipleIndices() throws IOException {
    final int numIndices = 3;
    final int numDocsInIndex = atLeast(50);
    Directory[] dirs = new Directory[numIndices];
    IndexReader[] readers = new IndexReader[numIndices];
    for (int i = 0; i < numIndices; i++) {
      dirs[i] = newDirectory();
      try (IndexWriter writer = new IndexWriter(dirs[i], new IndexWriterConfig())) {
        for (int docID = 0; docID < numDocsInIndex; docID++) {
          final Document doc = new Document();
          doc.add(new NumericDocValuesField("my_field", docID * numIndices + i));
          writer.addDocument(doc);
        }
        writer.flush();
      }
      readers[i] = DirectoryReader.open(dirs[i]);
    }

    final int size = 7;
    final int totalHitsThreshold = 7;
    final Sort sort = new Sort(FIELD_DOC, new SortField("my_field", SortField.Type.LONG));
    TopFieldDocs[] topDocs = new TopFieldDocs[numIndices];
    int curNumHits;
    FieldDoc after = null;
    long collectedDocs = 0;
    long totalDocs = 0;
    int numHits = 0;
    do {
      for (int i = 0; i < numIndices; i++) {
        IndexSearcher searcher = newSearcher(readers[i]);
        final TopFieldCollector collector =
            TopFieldCollector.create(sort, size, after, totalHitsThreshold);
        searcher.search(new MatchAllDocsQuery(), collector);
        topDocs[i] = collector.topDocs();
        for (int docID = 0; docID < topDocs[i].scoreDocs.length; docID++) {
          topDocs[i].scoreDocs[docID].shardIndex = i;
        }
        collectedDocs += topDocs[i].totalHits.value;
        totalDocs += numDocsInIndex;
      }
      TopFieldDocs mergedTopDocs = TopDocs.merge(sort, size, topDocs);
      curNumHits = mergedTopDocs.scoreDocs.length;
      numHits += curNumHits;
      if (curNumHits > 0) {
        after = (FieldDoc) mergedTopDocs.scoreDocs[curNumHits - 1];
      }
    } while (curNumHits > 0);

    for (int i = 0; i < numIndices; i++) {
      readers[i].close();
      dirs[i].close();
    }

    final int expectedNumHits = numDocsInIndex * numIndices;
    assertEquals(expectedNumHits, numHits);
    // check that the optimization was run, as very few docs were collected
    assertTrue(collectedDocs < totalDocs);
  }
  public void testDocSortOptimizationWithAfter() throws IOException {
    final Directory dir = newDirectory();
    final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());