Revert "Only consider clauses whose cost is less than the lead cost to compute block boundaries in WANDScorer. (#14000)"

This reverts commit 5807ff1620.
2024-11-20 10:15:44 +01:00 · 2024-11-20 10:15:44 +01:00 · b70d214217
parent a67120e175
commit b70d214217
4 changed files with 23 additions and 37 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -70,9 +70,6 @@ Optimizations
 * GITHUB#13994: Speed up top-k retrieval of filtered conjunctions.
  (Adrien Grand)
 * GITHUB#13996, GITHUB#14000: Speed up top-k retrieval of filtered disjunctions.
  (Adrien Grand)
 Bug Fixes
 ---------------------
 * GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
--- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java
+++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java
@ -478,7 +478,7 @@ final class BooleanScorerSupplier extends ScorerSupplier {
      // However, as WANDScorer uses more complex algorithm and data structure, we would like to
      // still use DisjunctionSumScorer to handle exhaustive pure disjunctions, which may be faster
      if ((scoreMode == ScoreMode.TOP_SCORES && topLevelScoringClause) || minShouldMatch > 1) {
-        return new WANDScorer(optionalScorers, minShouldMatch, scoreMode, leadCost);
+        return new WANDScorer(optionalScorers, minShouldMatch, scoreMode);
      } else {
        return new DisjunctionSumScorer(optionalScorers, scoreMode);
      }
--- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
@ -149,9 +149,8 @@ final class WANDScorer extends Scorer {
  int freq;
  final ScoreMode scoreMode;
  final long leadCost;
-  WANDScorer(Collection<Scorer> scorers, int minShouldMatch, ScoreMode scoreMode, long leadCost)
+  WANDScorer(Collection<Scorer> scorers, int minShouldMatch, ScoreMode scoreMode)
      throws IOException {
    if (minShouldMatch >= scorers.size()) {
@ -203,7 +202,6 @@ final class WANDScorer extends Scorer {
            scorers.stream().map(Scorer::iterator).mapToLong(DocIdSetIterator::cost),
            scorers.size(),
            minShouldMatch);
    this.leadCost = leadCost;
  }
  // returns a boolean so that it can be called from assert
@ -397,32 +395,26 @@ final class WANDScorer extends Scorer {
  }
  private void updateMaxScores(int target) throws IOException {
-    int newUpTo = DocIdSetIterator.NO_MORE_DOCS;
+    if (head.size() == 0) {
-    // If we have entries in 'head', we treat them all as leads and take the minimum of their next
+      // If the head is empty we use the greatest score contributor as a lead
-    // block boundaries as a next boundary.
+      // like for conjunctions.
-    // We don't take entries in 'tail' into account on purpose: 'tail' is supposed to contain the
+      upTo = tail[0].scorer.advanceShallow(target);
-    // least score contributors, and taking them into account might not move the boundary fast
+    } else {
-    // enough, so we'll waste CPU re-computing the next boundary all the time.
+      // If we still have entries in 'head', we treat them all as leads and
-    // Likewise, we ignore clauses whose cost is greater than the lead cost to avoid recomputing
+      // take the minimum of their next block boundaries as a next boundary.
-    // per-window max scores over and over again. In the event when this makes us compute upTo as
+      // We don't take entries in 'tail' into account on purpose: 'tail' is
-    // NO_MORE_DOCS, this scorer will effectively implement WAND rather than block-max WAND.
+      // supposed to contain the least score contributors, and taking them
-    for (DisiWrapper w : head) {
+      // into account might not move the boundary fast enough, so we'll waste
-      if (w.doc <= newUpTo && w.cost <= leadCost) {
+      // CPU re-computing the next boundary all the time.
-        newUpTo = Math.min(w.scorer.advanceShallow(w.doc), newUpTo);
+      int newUpTo = DocIdSetIterator.NO_MORE_DOCS;
-        w.scaledMaxScore = scaleMaxScore(w.scorer.getMaxScore(newUpTo), scalingFactor);
+      for (DisiWrapper w : head) {
        if (w.doc <= newUpTo) {
          newUpTo = Math.min(w.scorer.advanceShallow(w.doc), newUpTo);
          w.scaledMaxScore = scaleMaxScore(w.scorer.getMaxScore(newUpTo), scalingFactor);
        }
      }
      upTo = newUpTo;
    }
    // Only look at the tail if none of the `head` clauses had a block we could reuse and if its
    // cost is less than or equal to the lead cost.
    if (newUpTo == DocIdSetIterator.NO_MORE_DOCS && tailSize > 0 && tail[0].cost <= leadCost) {
      newUpTo = tail[0].scorer.advanceShallow(target);
      // upTo must be on or after the least `head` doc
      DisiWrapper headTop = head.top();
      if (headTop != null) {
        newUpTo = Math.max(newUpTo, headTop.doc);
      }
    }
    upTo = newUpTo;
    tailMaxScore = 0;
    for (int i = 0; i < tailSize; ++i) {
@ -468,7 +460,8 @@ final class WANDScorer extends Scorer {
      }
    }
-    assert head.size() == 0 || head.top().doc <= upTo;
+    assert (head.size() == 0 && upTo == DocIdSetIterator.NO_MORE_DOCS)
        || (head.size() > 0 && head.top().doc <= upTo);
    assert upTo >= target;
  }
--- a/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
@ -1024,11 +1024,7 @@ public class TestWANDScorer extends LuceneTestCase {
          final Scorer scorer;
          if (optionalScorers.size() > 0) {
            scorer =
-                new WANDScorer(
+                new WANDScorer(optionalScorers, query.getMinimumNumberShouldMatch(), scoreMode);
                    optionalScorers,
                    query.getMinimumNumberShouldMatch(),
                    scoreMode,
                    Long.MAX_VALUE);
          } else {
            scorer = weight.scorer(context);
            if (scorer == null) return null;