Revert "Only consider clauses whose cost is less than the lead cost to compute block boundaries in WANDScorer. (#14000)"

This reverts commit 5807ff1620.
This commit is contained in:
Adrien Grand 2024-11-20 10:15:44 +01:00
parent a67120e175
commit b70d214217
4 changed files with 23 additions and 37 deletions

View File

@ -70,9 +70,6 @@ Optimizations
* GITHUB#13994: Speed up top-k retrieval of filtered conjunctions. * GITHUB#13994: Speed up top-k retrieval of filtered conjunctions.
(Adrien Grand) (Adrien Grand)
* GITHUB#13996, GITHUB#14000: Speed up top-k retrieval of filtered disjunctions.
(Adrien Grand)
Bug Fixes Bug Fixes
--------------------- ---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended * GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended

View File

@ -478,7 +478,7 @@ final class BooleanScorerSupplier extends ScorerSupplier {
// However, as WANDScorer uses more complex algorithm and data structure, we would like to // However, as WANDScorer uses more complex algorithm and data structure, we would like to
// still use DisjunctionSumScorer to handle exhaustive pure disjunctions, which may be faster // still use DisjunctionSumScorer to handle exhaustive pure disjunctions, which may be faster
if ((scoreMode == ScoreMode.TOP_SCORES && topLevelScoringClause) || minShouldMatch > 1) { if ((scoreMode == ScoreMode.TOP_SCORES && topLevelScoringClause) || minShouldMatch > 1) {
return new WANDScorer(optionalScorers, minShouldMatch, scoreMode, leadCost); return new WANDScorer(optionalScorers, minShouldMatch, scoreMode);
} else { } else {
return new DisjunctionSumScorer(optionalScorers, scoreMode); return new DisjunctionSumScorer(optionalScorers, scoreMode);
} }

View File

@ -149,9 +149,8 @@ final class WANDScorer extends Scorer {
int freq; int freq;
final ScoreMode scoreMode; final ScoreMode scoreMode;
final long leadCost;
WANDScorer(Collection<Scorer> scorers, int minShouldMatch, ScoreMode scoreMode, long leadCost) WANDScorer(Collection<Scorer> scorers, int minShouldMatch, ScoreMode scoreMode)
throws IOException { throws IOException {
if (minShouldMatch >= scorers.size()) { if (minShouldMatch >= scorers.size()) {
@ -203,7 +202,6 @@ final class WANDScorer extends Scorer {
scorers.stream().map(Scorer::iterator).mapToLong(DocIdSetIterator::cost), scorers.stream().map(Scorer::iterator).mapToLong(DocIdSetIterator::cost),
scorers.size(), scorers.size(),
minShouldMatch); minShouldMatch);
this.leadCost = leadCost;
} }
// returns a boolean so that it can be called from assert // returns a boolean so that it can be called from assert
@ -397,32 +395,26 @@ final class WANDScorer extends Scorer {
} }
private void updateMaxScores(int target) throws IOException { private void updateMaxScores(int target) throws IOException {
int newUpTo = DocIdSetIterator.NO_MORE_DOCS; if (head.size() == 0) {
// If we have entries in 'head', we treat them all as leads and take the minimum of their next // If the head is empty we use the greatest score contributor as a lead
// block boundaries as a next boundary. // like for conjunctions.
// We don't take entries in 'tail' into account on purpose: 'tail' is supposed to contain the upTo = tail[0].scorer.advanceShallow(target);
// least score contributors, and taking them into account might not move the boundary fast } else {
// enough, so we'll waste CPU re-computing the next boundary all the time. // If we still have entries in 'head', we treat them all as leads and
// Likewise, we ignore clauses whose cost is greater than the lead cost to avoid recomputing // take the minimum of their next block boundaries as a next boundary.
// per-window max scores over and over again. In the event when this makes us compute upTo as // We don't take entries in 'tail' into account on purpose: 'tail' is
// NO_MORE_DOCS, this scorer will effectively implement WAND rather than block-max WAND. // supposed to contain the least score contributors, and taking them
for (DisiWrapper w : head) { // into account might not move the boundary fast enough, so we'll waste
if (w.doc <= newUpTo && w.cost <= leadCost) { // CPU re-computing the next boundary all the time.
newUpTo = Math.min(w.scorer.advanceShallow(w.doc), newUpTo); int newUpTo = DocIdSetIterator.NO_MORE_DOCS;
w.scaledMaxScore = scaleMaxScore(w.scorer.getMaxScore(newUpTo), scalingFactor); for (DisiWrapper w : head) {
if (w.doc <= newUpTo) {
newUpTo = Math.min(w.scorer.advanceShallow(w.doc), newUpTo);
w.scaledMaxScore = scaleMaxScore(w.scorer.getMaxScore(newUpTo), scalingFactor);
}
} }
upTo = newUpTo;
} }
// Only look at the tail if none of the `head` clauses had a block we could reuse and if its
// cost is less than or equal to the lead cost.
if (newUpTo == DocIdSetIterator.NO_MORE_DOCS && tailSize > 0 && tail[0].cost <= leadCost) {
newUpTo = tail[0].scorer.advanceShallow(target);
// upTo must be on or after the least `head` doc
DisiWrapper headTop = head.top();
if (headTop != null) {
newUpTo = Math.max(newUpTo, headTop.doc);
}
}
upTo = newUpTo;
tailMaxScore = 0; tailMaxScore = 0;
for (int i = 0; i < tailSize; ++i) { for (int i = 0; i < tailSize; ++i) {
@ -468,7 +460,8 @@ final class WANDScorer extends Scorer {
} }
} }
assert head.size() == 0 || head.top().doc <= upTo; assert (head.size() == 0 && upTo == DocIdSetIterator.NO_MORE_DOCS)
|| (head.size() > 0 && head.top().doc <= upTo);
assert upTo >= target; assert upTo >= target;
} }

View File

@ -1024,11 +1024,7 @@ public class TestWANDScorer extends LuceneTestCase {
final Scorer scorer; final Scorer scorer;
if (optionalScorers.size() > 0) { if (optionalScorers.size() > 0) {
scorer = scorer =
new WANDScorer( new WANDScorer(optionalScorers, query.getMinimumNumberShouldMatch(), scoreMode);
optionalScorers,
query.getMinimumNumberShouldMatch(),
scoreMode,
Long.MAX_VALUE);
} else { } else {
scorer = weight.scorer(context); scorer = weight.scorer(context);
if (scorer == null) return null; if (scorer == null) return null;