Only consider clauses whose cost is less than or equal to the lead cost to compute block boundaries in WANDScorer. (#14003)

WANDScorer implements block-max WAND and needs to recompute score upper bounds whenever it moves to a different block. It is therefore important for these blocks to be large enough to avoid recomputing score upper bounds over and over again. With this commit, WANDScorer no longer uses clauses whose cost is higher than the lead cost (for a filtered disjunction, the cost of the filter) to compute block boundaries. This effectively makes blocks larger when the filter is more selective.
2024-11-22 18:18:00 +01:00 · 2024-11-22 18:18:00 +01:00 · 90fee7e13f
parent 532d267228
commit 90fee7e13f
4 changed files with 69 additions and 37 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -99,6 +99,9 @@ Optimizations
 * GITHUB#13985: Introduces IndexInput#updateReadAdvice to change the ReadAdvice
  while merging vectors. (Tejas Shah)

+* GITHUB#14000: Speed up top-k retrieval of filtered disjunctions.
+  (Adrien Grand)
+
 Bug Fixes
 ---------------------
 * GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
--- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java
+++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java
@ -478,7 +478,7 @@ final class BooleanScorerSupplier extends ScorerSupplier {
      // However, as WANDScorer uses more complex algorithm and data structure, we would like to
      // still use DisjunctionSumScorer to handle exhaustive pure disjunctions, which may be faster
      if ((scoreMode == ScoreMode.TOP_SCORES && topLevelScoringClause) || minShouldMatch > 1) {
-        return new WANDScorer(optionalScorers, minShouldMatch, scoreMode);
+        return new WANDScorer(optionalScorers, minShouldMatch, scoreMode, leadCost);
      } else {
        return new DisjunctionSumScorer(optionalScorers, scoreMode);
      }
--- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
@ -149,8 +149,9 @@ final class WANDScorer extends Scorer {
  int freq;

  final ScoreMode scoreMode;
+  final long leadCost;

-  WANDScorer(Collection<Scorer> scorers, int minShouldMatch, ScoreMode scoreMode)
+  WANDScorer(Collection<Scorer> scorers, int minShouldMatch, ScoreMode scoreMode, long leadCost)
      throws IOException {

    if (minShouldMatch >= scorers.size()) {
@ -202,6 +203,7 @@ final class WANDScorer extends Scorer {
            scorers.stream().map(Scorer::iterator).mapToLong(DocIdSetIterator::cost),
            scorers.size(),
            minShouldMatch);
+    this.leadCost = leadCost;
  }

  // returns a boolean so that it can be called from assert
@ -395,25 +397,37 @@ final class WANDScorer extends Scorer {
  }

  private void updateMaxScores(int target) throws IOException {
-    if (head.size() == 0) {
-      // If the head is empty we use the greatest score contributor as a lead
-      // like for conjunctions.
-      upTo = tail[0].scorer.advanceShallow(target);
-    } else {
-      // If we still have entries in 'head', we treat them all as leads and
-      // take the minimum of their next block boundaries as a next boundary.
-      // We don't take entries in 'tail' into account on purpose: 'tail' is
-      // supposed to contain the least score contributors, and taking them
-      // into account might not move the boundary fast enough, so we'll waste
-      // CPU re-computing the next boundary all the time.
-      int newUpTo = DocIdSetIterator.NO_MORE_DOCS;
-      for (DisiWrapper w : head) {
-        if (w.doc <= newUpTo) {
-          newUpTo = Math.min(w.scorer.advanceShallow(w.doc), newUpTo);
-          w.scaledMaxScore = scaleMaxScore(w.scorer.getMaxScore(newUpTo), scalingFactor);
-        }
+    int newUpTo = DocIdSetIterator.NO_MORE_DOCS;
+    // If we have entries in 'head', we treat them all as leads and take the minimum of their next
+    // block boundaries as a next boundary.
+    // We don't take entries in 'tail' into account on purpose: 'tail' is supposed to contain the
+    // least score contributors, and taking them into account might not move the boundary fast
+    // enough, so we'll waste CPU re-computing the next boundary all the time.
+    // Likewise, we ignore clauses whose cost is greater than the lead cost to avoid recomputing
+    // per-window max scores over and over again. If this causes upTo to be computed as
+    // NO_MORE_DOCS, this scorer effectively implements WAND rather than block-max WAND.
+    for (DisiWrapper w : head) {
+      if (w.doc <= newUpTo && w.cost <= leadCost) {
+        newUpTo = Math.min(w.scorer.advanceShallow(w.doc), newUpTo);
+      }
+    }
+    // Only look at the tail if none of the `head` clauses had a block we could reuse and if the
+    // cost of tail[0] is less than or equal to the lead cost.
+    if (newUpTo == DocIdSetIterator.NO_MORE_DOCS && tailSize > 0 && tail[0].cost <= leadCost) {
+      newUpTo = tail[0].scorer.advanceShallow(target);
+      // upTo must be on or after the least `head` doc
+      DisiWrapper headTop = head.top();
+      if (headTop != null) {
+        newUpTo = Math.max(newUpTo, headTop.doc);
+      }
+    }
+    upTo = newUpTo;
+
+    // Now update the max scores of clauses that are before upTo.
+    for (DisiWrapper w : head) {
+      if (w.doc <= upTo) {
+        w.scaledMaxScore = scaleMaxScore(w.scorer.getMaxScore(newUpTo), scalingFactor);
      }
-      upTo = newUpTo;
    }

    tailMaxScore = 0;
@ -460,8 +474,7 @@ final class WANDScorer extends Scorer {
      }
    }

-    assert (head.size() == 0 && upTo == DocIdSetIterator.NO_MORE_DOCS)
-        || (head.size() > 0 && head.top().doc <= upTo);
+    assert head.size() == 0 || head.top().doc <= upTo;
    assert upTo >= target;
  }

--- a/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
@ -119,7 +119,8 @@ public class TestWANDScorer extends LuceneTestCase {
                .add(
                    new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "C"))), 3),
                    Occur.SHOULD)
-                .build());
+                .build(),
+            random().nextBoolean());

    Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.TOP_SCORES, 1);
    ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0));
@ -180,7 +181,8 @@ public class TestWANDScorer extends LuceneTestCase {
                        .add(
                            new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
                            Occur.SHOULD)
-                        .build()),
+                        .build(),
+                    random().nextBoolean()),
                Occur.MUST)
            .add(new TermQuery(new Term("foo", "C")), Occur.FILTER)
            .build();
@ -222,7 +224,8 @@ public class TestWANDScorer extends LuceneTestCase {
                        .add(
                            new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
                            Occur.SHOULD)
-                        .build()),
+                        .build(),
+                    random().nextBoolean()),
                Occur.MUST)
            .add(new TermQuery(new Term("foo", "C")), Occur.MUST_NOT)
            .build();
@ -297,7 +300,8 @@ public class TestWANDScorer extends LuceneTestCase {
                            new ConstantScoreQuery(new TermQuery(new Term("foo", "C"))), 3),
                        Occur.SHOULD)
                    .setMinimumNumberShouldMatch(2)
-                    .build());
+                    .build(),
+                random().nextBoolean());

        Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.TOP_SCORES, 1);
        ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0));
@ -377,7 +381,8 @@ public class TestWANDScorer extends LuceneTestCase {
                    .add(new TermQuery(new Term("foo", "B")), Occur.SHOULD)
                    .add(new TermQuery(new Term("foo", "C")), Occur.SHOULD)
                    .setMinimumNumberShouldMatch(2)
-                    .build());
+                    .build(),
+                random().nextBoolean());

        Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.TOP_SCORES, 1);
        ScorerSupplier ss = weight.scorerSupplier(searcher.getIndexReader().leaves().get(0));
@ -431,7 +436,8 @@ public class TestWANDScorer extends LuceneTestCase {
                            new ConstantScoreQuery(new TermQuery(new Term("foo", "C"))), 3),
                        Occur.SHOULD)
                    .setMinimumNumberShouldMatch(2)
-                    .build());
+                    .build(),
+                random().nextBoolean());

        Scorer scorer =
            searcher
@ -489,7 +495,8 @@ public class TestWANDScorer extends LuceneTestCase {
                                    new ConstantScoreQuery(new TermQuery(new Term("foo", "D"))), 4),
                                Occur.SHOULD)
                            .setMinimumNumberShouldMatch(2)
-                            .build()),
+                            .build(),
+                        random().nextBoolean()),
                    Occur.MUST)
                .add(new TermQuery(new Term("foo", "C")), Occur.FILTER)
                .build();
@ -565,7 +572,8 @@ public class TestWANDScorer extends LuceneTestCase {
                                    new ConstantScoreQuery(new TermQuery(new Term("foo", "D"))), 4),
                                Occur.SHOULD)
                            .setMinimumNumberShouldMatch(2)
-                            .build()),
+                            .build(),
+                        random().nextBoolean()),
                    Occur.MUST)
                .add(new TermQuery(new Term("foo", "C")), Occur.FILTER)
                .build();
@ -625,7 +633,8 @@ public class TestWANDScorer extends LuceneTestCase {
                                    new ConstantScoreQuery(new TermQuery(new Term("foo", "D"))), 4),
                                Occur.SHOULD)
                            .setMinimumNumberShouldMatch(2)
-                            .build()),
+                            .build(),
+                        random().nextBoolean()),
                    Occur.MUST)
                .add(new TermQuery(new Term("foo", "C")), Occur.MUST_NOT)
                .build();
@ -699,7 +708,8 @@ public class TestWANDScorer extends LuceneTestCase {
                                    new ConstantScoreQuery(new TermQuery(new Term("foo", "D"))), 4),
                                Occur.SHOULD)
                            .setMinimumNumberShouldMatch(2)
-                            .build()),
+                            .build(),
+                        random().nextBoolean()),
                    Occur.MUST)
                .add(new TermQuery(new Term("foo", "C")), Occur.MUST_NOT)
                .build();
@ -744,7 +754,7 @@ public class TestWANDScorer extends LuceneTestCase {
        builder.add(
            maybeWrap(new TermQuery(new Term("foo", Integer.toString(start + i)))), Occur.SHOULD);
      }
-      Query query = new WANDScorerQuery(builder.build());
+      Query query = new WANDScorerQuery(builder.build(), random().nextBoolean());

      CheckHits.checkTopScores(random(), query, searcher);

@ -795,7 +805,7 @@ public class TestWANDScorer extends LuceneTestCase {
                    0f)),
            Occur.SHOULD);
      }
-      Query query = new WANDScorerQuery(builder.build());
+      Query query = new WANDScorerQuery(builder.build(), random().nextBoolean());

      CheckHits.checkTopScores(random(), query, searcher);

@ -855,7 +865,7 @@ public class TestWANDScorer extends LuceneTestCase {
        }
        builder.add(query, Occur.SHOULD);
      }
-      Query query = new WANDScorerQuery(builder.build());
+      Query query = new WANDScorerQuery(builder.build(), random().nextBoolean());

      CheckHits.checkTopScores(random(), query, searcher);

@ -979,11 +989,13 @@ public class TestWANDScorer extends LuceneTestCase {

  private static class WANDScorerQuery extends Query {
    private final BooleanQuery query;
+    private final boolean doBlocks;

-    private WANDScorerQuery(BooleanQuery query) {
+    private WANDScorerQuery(BooleanQuery query, boolean doBlocks) {
      assert query.clauses().size() == query.getClauses(Occur.SHOULD).size()
          : "This test utility query is only used to create WANDScorer for disjunctions.";
      this.query = query;
+      this.doBlocks = doBlocks;
    }

    @Override
@ -1024,7 +1036,11 @@ public class TestWANDScorer extends LuceneTestCase {
          final Scorer scorer;
          if (optionalScorers.size() > 0) {
            scorer =
-                new WANDScorer(optionalScorers, query.getMinimumNumberShouldMatch(), scoreMode);
+                new WANDScorer(
+                    optionalScorers,
+                    query.getMinimumNumberShouldMatch(),
+                    scoreMode,
+                    doBlocks ? Long.MAX_VALUE : 0L);
          } else {
            scorer = weight.scorer(context);
            if (scorer == null) return null;