From 555c7d2ef2d49f1a7ceb47d8e717a3ea6dbe58c9 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 22 Oct 2024 16:52:36 +0200 Subject: [PATCH] Introduce a heuristic to amortize the per-window overhead in MaxScoreBulkScorer. (#13941) It is sometimes possible for `MaxScoreBulkScorer` to compute windows that don't contain many candidate matches, resulting in more time spent evaluating maximum scores per window than evaluating candidate matches on this window. This PR introduces a heuristic that tries to require at least 32 candidate matches per clause per window to amortize the per-window overhead. This results in a speedup for the `OrMany` task. --- lucene/CHANGES.txt | 3 ++ .../lucene/search/MaxScoreBulkScorer.java | 30 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0730020cca5..1358b3c588c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -34,6 +34,9 @@ Optimizations speeds up exhaustive evaluation of disjunctions of term queries. (Adrien Grand) +* GITHUB#13941: Optimized computation of top-hits on disjunctive queries with + many clauses. (Adrien Grand) + Bug Fixes --------------------- * GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java index 18f5b83e93a..56857bc67cc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java @@ -66,6 +66,15 @@ final class MaxScoreBulkScorer extends BulkScorer { maxScoreSums = new double[allScorers.length]; } + // Number of outer windows that have been evaluated + private int numOuterWindows; + // Number of candidate matches so far + private int numCandidates; + // Minimum window size. See #computeOuterWindowMax where we have heuristics that adjust the + // minimum window size based on the average number of candidate matches per outer window, to keep + // the per-window overhead under control. + private int minWindowSize = 1; + @Override public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException { collector.setScorer(scorable); @@ -124,6 +133,7 @@ final class MaxScoreBulkScorer extends BulkScorer { } outerWindowMin = Math.min(top.doc, outerWindowMax); + ++numOuterWindows; } return nextCandidate(max); @@ -278,6 +288,23 @@ final class MaxScoreBulkScorer extends BulkScorer { windowMax = (int) Math.min(windowMax, upTo + 1L); // upTo is inclusive } + if (allScorers.length - firstWindowLead > 1) { + // The more clauses we consider to compute outer windows, the higher chances that one of these + // clauses has a block boundary in the next few doc IDs. This situation can result in more + // time spent computing maximum scores per outer window than evaluating hits. To avoid such + // situations, we target at least 32 candidate matches per clause per outer window on average, + // to make sure we amortize the cost of computing maximum scores. + long threshold = numOuterWindows * 32L * allScorers.length; + if (numCandidates < threshold) { + minWindowSize = Math.min(minWindowSize << 1, INNER_WINDOW_SIZE); + } else { + minWindowSize = 1; + } + + int minWindowMax = (int) Math.min(Integer.MAX_VALUE, (long) windowMin + minWindowSize); + windowMax = Math.max(windowMax, minWindowMax); + } + return windowMax; } @@ -300,6 +327,9 @@ final class MaxScoreBulkScorer extends BulkScorer { private void scoreNonEssentialClauses( LeafCollector collector, int doc, double essentialScore, int numNonEssentialClauses) throws IOException { + + ++numCandidates; + double score = essentialScore; for (int i = numNonEssentialClauses - 1; i >= 0; --i) { float maxPossibleScore =