LUCENE-10126: Fix competitiveIterator wrongly skip documents (#324)

The competitive iterator can wrongly skip a document that is advanced but not collected in the previous scoreRange.
2021-09-28 15:26:30 -04:00 · 2021-09-28 15:26:30 -04:00 · 5ab900e10b
parent 9f80b4d8fb
commit 5ab900e10b
5 changed files with 42 additions and 23 deletions
--- a/lucene/core/src/java/org/apache/lucene/search/Weight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/Weight.java
@ -225,18 +225,23 @@ public abstract class Weight implements SegmentCacheable {
        throws IOException {
      collector.setScorer(scorer);
      DocIdSetIterator scorerIterator = twoPhase == null ? iterator : twoPhase.approximation();
-      DocIdSetIterator collectorIterator = collector.competitiveIterator();
+      DocIdSetIterator competitiveIterator = collector.competitiveIterator();
      DocIdSetIterator filteredIterator;
-      if (collectorIterator == null) {
+      if (competitiveIterator == null) {
        filteredIterator = scorerIterator;
      } else {
+        // Wrap CompetitiveIterator and ScorerIterator start with (i.e., calling nextDoc()) the last
+        // visited docID because ConjunctionDISI might have advanced to it in the previous
+        // scoreRange, but we didn't process due to the range limit of scoreRange.
        if (scorerIterator.docID() != -1) {
-          // Wrap ScorerIterator to start from -1 for conjunction
          scorerIterator = new StartDISIWrapper(scorerIterator);
        }
+        if (competitiveIterator.docID() != -1) {
+          competitiveIterator = new StartDISIWrapper(competitiveIterator);
+        }
        // filter scorerIterator to keep only competitive docs as defined by collector
        filteredIterator =
-            ConjunctionUtils.intersectIterators(Arrays.asList(scorerIterator, collectorIterator));
+            ConjunctionUtils.intersectIterators(Arrays.asList(scorerIterator, competitiveIterator));
      }
      if (filteredIterator.docID() == -1 && min == 0 && max == DocIdSetIterator.NO_MORE_DOCS) {
        scoreAll(collector, filteredIterator, twoPhase, acceptDocs);
@ -314,15 +319,15 @@ public abstract class Weight implements SegmentCacheable {
    }
  }

-  /** Wraps an internal docIdSetIterator for it to start with docID = -1 */
-  protected static class StartDISIWrapper extends DocIdSetIterator {
+  /** Wraps an internal docIdSetIterator for it to start with the last visited docID */
+  private static class StartDISIWrapper extends DocIdSetIterator {
    private final DocIdSetIterator in;
-    private final int min;
+    private final int startDocID;
    private int docID = -1;

-    public StartDISIWrapper(DocIdSetIterator in) {
+    StartDISIWrapper(DocIdSetIterator in) {
      this.in = in;
-      this.min = in.docID();
+      this.startDocID = in.docID();
    }

    @Override
@ -337,8 +342,8 @@ public abstract class Weight implements SegmentCacheable {

    @Override
    public int advance(int target) throws IOException {
-      if (target <= min) {
-        return docID = min;
+      if (target <= startDocID) {
+        return docID = startDocID;
      }
      return docID = in.advance(target);
    }
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/DocComparator.java
@ -131,7 +131,7 @@ public class DocComparator extends FieldComparator<Integer> {
        return null;
      } else {
        return new DocIdSetIterator() {
-          private int docID = -1;
+          private int docID = competitiveIterator.docID();

          @Override
          public int nextDoc() throws IOException {
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java
@ -273,7 +273,7 @@ public abstract class NumericComparator<T extends Number> extends FieldComparato
    public DocIdSetIterator competitiveIterator() {
      if (enableSkipping == false) return null;
      return new DocIdSetIterator() {
-        private int docID = -1;
+        private int docID = competitiveIterator.docID();

        @Override
        public int nextDoc() throws IOException {
--- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
@ -58,7 +58,7 @@ public class TestSortOptimization extends LuceneTestCase {
    }
    final IndexReader reader = DirectoryReader.open(writer);
    writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader);
    final SortField sortField = new SortField("my_field", SortField.Type.LONG);
    final Sort sort = new Sort(sortField);
    final int numHits = 3;
@ -143,7 +143,7 @@ public class TestSortOptimization extends LuceneTestCase {
    }
    final IndexReader reader = DirectoryReader.open(writer);
    writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader);
    final SortField sortField = new SortField("my_field", SortField.Type.LONG);
    final Sort sort = new Sort(sortField);
    final int numHits = 3;
@ -182,7 +182,7 @@ public class TestSortOptimization extends LuceneTestCase {
    }
    final IndexReader reader = DirectoryReader.open(writer);
    writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader);
    final int numHits = 3;
    final int totalHitsThreshold = 3;

@ -220,7 +220,7 @@ public class TestSortOptimization extends LuceneTestCase {
  public void testSortOptimizationEqualValues() throws IOException {
    final Directory dir = newDirectory();
    final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
-    final int numDocs = atLeast(10000);
+    final int numDocs = atLeast(TEST_NIGHTLY ? 50_000 : 10_000);
    for (int i = 1; i <= numDocs; ++i) {
      final Document doc = new Document();
      doc.add(
@ -234,7 +234,7 @@ public class TestSortOptimization extends LuceneTestCase {
    }
    final IndexReader reader = DirectoryReader.open(writer);
    writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader);
    final int numHits = 3;
    final int totalHitsThreshold = 3;

@ -313,7 +313,7 @@ public class TestSortOptimization extends LuceneTestCase {
    }
    final IndexReader reader = DirectoryReader.open(writer);
    writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader);
    final SortField sortField = new SortField("my_field", SortField.Type.FLOAT);
    final Sort sort = new Sort(sortField);
    final int numHits = 3;
@ -661,7 +661,7 @@ public class TestSortOptimization extends LuceneTestCase {
    }
    IndexReader reader = DirectoryReader.open(writer);
    writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader);
    SortField sortField = new SortField("my_field", SortField.Type.LONG);
    TopFieldDocs topDocs =
        searcher.search(new MatchAllDocsQuery(), 1 + random().nextInt(100), new Sort(sortField));
@ -706,7 +706,7 @@ public class TestSortOptimization extends LuceneTestCase {
    seqNos.sort(Long::compare);
    IndexReader reader = DirectoryReader.open(writer);
    writer.close();
-    IndexSearcher searcher = new IndexSearcher(reader);
+    IndexSearcher searcher = newSearcher(reader);
    SortField sortField = new SortField("seq_no", SortField.Type.LONG);
    int visitedHits = 0;
    ScoreDoc after = null;
--- a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingBulkScorer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingBulkScorer.java
@ -80,8 +80,22 @@ final class AssertingBulkScorer extends BulkScorer {
    assert min <= max : "max must be greater than min, got min=" + min + ", and max=" + max;
    this.max = max;
    collector = new AssertingLeafCollector(collector, min, max);
-    final int next = in.score(collector, acceptDocs, min, max);
-    assert next >= max;
+    int next = min;
+    while (next < max) {
+      final int upTo;
+      if (random.nextBoolean()) {
+        upTo = max;
+      } else {
+        final int interval;
+        if (random.nextInt(100) <= 5) {
+          interval = 1 + random.nextInt(10);
+        } else {
+          interval = 1 + random.nextInt(random.nextBoolean() ? 100 : 5000);
+        }
+        upTo = Math.toIntExact(Math.min(min + interval, max));
+      }
+      next = in.score(collector, acceptDocs, min, upTo);
+    }
    if (max >= maxDoc || next >= maxDoc) {
      assert next == DocIdSetIterator.NO_MORE_DOCS;
      return DocIdSetIterator.NO_MORE_DOCS;