From b70d214217aa10429f8e4cc539c1e81fa92cfbd8 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 20 Nov 2024 10:15:44 +0100 Subject: [PATCH] Revert "Only consider clauses whose cost is less than the lead cost to compute block boundaries in WANDScorer. (#14000)" This reverts commit 5807ff1620014c9fa85d17b8518ee5e2b6e97162. --- lucene/CHANGES.txt | 3 -- .../lucene/search/BooleanScorerSupplier.java | 2 +- .../org/apache/lucene/search/WANDScorer.java | 49 ++++++++----------- .../apache/lucene/search/TestWANDScorer.java | 6 +-- 4 files changed, 23 insertions(+), 37 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ce9fb078ec3..0e697a0dab3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -70,9 +70,6 @@ Optimizations * GITHUB#13994: Speed up top-k retrieval of filtered conjunctions. (Adrien Grand) -* GITHUB#13996, GITHUB#14000: Speed up top-k retrieval of filtered disjunctions. - (Adrien Grand) - Bug Fixes --------------------- * GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java index 515d0a6bba1..7a53bc9a485 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorerSupplier.java @@ -478,7 +478,7 @@ final class BooleanScorerSupplier extends ScorerSupplier { // However, as WANDScorer uses more complex algorithm and data structure, we would like to // still use DisjunctionSumScorer to handle exhaustive pure disjunctions, which may be faster if ((scoreMode == ScoreMode.TOP_SCORES && topLevelScoringClause) || minShouldMatch > 1) { - return new WANDScorer(optionalScorers, minShouldMatch, scoreMode, leadCost); + return new WANDScorer(optionalScorers, minShouldMatch, scoreMode); } else { return new DisjunctionSumScorer(optionalScorers, scoreMode); } diff --git a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java index 730058b33ff..f910db30a26 100644 --- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java @@ -149,9 +149,8 @@ final class WANDScorer extends Scorer { int freq; final ScoreMode scoreMode; - final long leadCost; - WANDScorer(Collection scorers, int minShouldMatch, ScoreMode scoreMode, long leadCost) + WANDScorer(Collection scorers, int minShouldMatch, ScoreMode scoreMode) throws IOException { if (minShouldMatch >= scorers.size()) { @@ -203,7 +202,6 @@ final class WANDScorer extends Scorer { scorers.stream().map(Scorer::iterator).mapToLong(DocIdSetIterator::cost), scorers.size(), minShouldMatch); - this.leadCost = leadCost; } // returns a boolean so that it can be called from assert @@ -397,32 +395,26 @@ final class WANDScorer extends Scorer { } private void updateMaxScores(int target) throws IOException { - int newUpTo = DocIdSetIterator.NO_MORE_DOCS; - // If we have entries in 'head', we treat them all as leads and take the minimum of their next - // block boundaries as a next boundary. - // We don't take entries in 'tail' into account on purpose: 'tail' is supposed to contain the - // least score contributors, and taking them into account might not move the boundary fast - // enough, so we'll waste CPU re-computing the next boundary all the time. - // Likewise, we ignore clauses whose cost is greater than the lead cost to avoid recomputing - // per-window max scores over and over again. In the event when this makes us compute upTo as - // NO_MORE_DOCS, this scorer will effectively implement WAND rather than block-max WAND. - for (DisiWrapper w : head) { - if (w.doc <= newUpTo && w.cost <= leadCost) { - newUpTo = Math.min(w.scorer.advanceShallow(w.doc), newUpTo); - w.scaledMaxScore = scaleMaxScore(w.scorer.getMaxScore(newUpTo), scalingFactor); + if (head.size() == 0) { + // If the head is empty we use the greatest score contributor as a lead + // like for conjunctions. + upTo = tail[0].scorer.advanceShallow(target); + } else { + // If we still have entries in 'head', we treat them all as leads and + // take the minimum of their next block boundaries as a next boundary. + // We don't take entries in 'tail' into account on purpose: 'tail' is + // supposed to contain the least score contributors, and taking them + // into account might not move the boundary fast enough, so we'll waste + // CPU re-computing the next boundary all the time. + int newUpTo = DocIdSetIterator.NO_MORE_DOCS; + for (DisiWrapper w : head) { + if (w.doc <= newUpTo) { + newUpTo = Math.min(w.scorer.advanceShallow(w.doc), newUpTo); + w.scaledMaxScore = scaleMaxScore(w.scorer.getMaxScore(newUpTo), scalingFactor); + } } + upTo = newUpTo; } - // Only look at the tail if none of the `head` clauses had a block we could reuse and if its - // cost is less than or equal to the lead cost. - if (newUpTo == DocIdSetIterator.NO_MORE_DOCS && tailSize > 0 && tail[0].cost <= leadCost) { - newUpTo = tail[0].scorer.advanceShallow(target); - // upTo must be on or after the least `head` doc - DisiWrapper headTop = head.top(); - if (headTop != null) { - newUpTo = Math.max(newUpTo, headTop.doc); - } - } - upTo = newUpTo; tailMaxScore = 0; for (int i = 0; i < tailSize; ++i) { @@ -468,7 +460,8 @@ final class WANDScorer extends Scorer { } } - assert head.size() == 0 || head.top().doc <= upTo; + assert (head.size() == 0 && upTo == DocIdSetIterator.NO_MORE_DOCS) + || (head.size() > 0 && head.top().doc <= upTo); assert upTo >= target; } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java index ea914f1acaa..678a0812ebc 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java @@ -1024,11 +1024,7 @@ public class TestWANDScorer extends LuceneTestCase { final Scorer scorer; if (optionalScorers.size() > 0) { scorer = - new WANDScorer( - optionalScorers, - query.getMinimumNumberShouldMatch(), - scoreMode, - Long.MAX_VALUE); + new WANDScorer(optionalScorers, query.getMinimumNumberShouldMatch(), scoreMode); } else { scorer = weight.scorer(context); if (scorer == null) return null;