From 4aeecdfebf1443c9e5516bb6605257865756a47d Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 19 Nov 2024 09:53:45 +0100 Subject: [PATCH] Speed up top-k retrieval of filtered disjunctions a bit. (#13996) This moves work from `advance(int target)` to `TwoPhaseIterator#matches()` so that we do less work on hits that do not match the filter. --- lucene/CHANGES.txt | 2 +- .../org/apache/lucene/search/WANDScorer.java | 70 +++++++------------ 2 files changed, 26 insertions(+), 46 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b854b5badaf..982e22374f0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -96,7 +96,7 @@ Optimizations * GITHUB#13994: Speed up top-k retrieval of filtered conjunctions. (Adrien Grand) -* GITHUB#14000: Speed up top-k retrieval of filtered disjunctions. +* GITHUB#13996, GITHUB#14000: Speed up top-k retrieval of filtered disjunctions. (Adrien Grand) Bug Fixes diff --git a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java index 1a1ae90203e..730058b33ff 100644 --- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java @@ -231,7 +231,11 @@ final class WANDScorer extends Scorer { } for (DisiWrapper w : head) { - assert w.doc > doc; + if (lead == null) { // After calling advance() but before matches() + assert w.doc >= doc; + } else { + assert w.doc > doc; + } } return true; @@ -286,20 +290,21 @@ final class WANDScorer extends Scorer { // Move 'lead' iterators back to the tail pushBackLeads(target); - // Advance 'head' as well - advanceHead(target); + // Make sure `head` is also on or beyond `target` + DisiWrapper headTop = advanceHead(target); - // Pop the new 'lead' from 'head' - moveToNextCandidate(target); - - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - return DocIdSetIterator.NO_MORE_DOCS; + if (scoreMode == ScoreMode.TOP_SCORES && (headTop == null || headTop.doc > upTo)) { + // Update score bounds if necessary + moveToNextBlock(target); + assert upTo >= target; + headTop = head.top(); } - assert ensureConsistent(); - - // Advance to the next possible match - return doNextCompetitiveCandidate(); + if (headTop == null) { + return doc = DocIdSetIterator.NO_MORE_DOCS; + } else { + return doc = headTop.doc; + } } @Override @@ -311,6 +316,9 @@ final class WANDScorer extends Scorer { @Override public boolean matches() throws IOException { + assert lead == null; + moveToNextCandidate(); + while (leadMaxScore < minCompetitiveScore || freq < minShouldMatch) { if (leadMaxScore + tailMaxScore < minCompetitiveScore || freq + tailSize < minShouldMatch) { @@ -355,7 +363,7 @@ final class WANDScorer extends Scorer { } /** Make sure all disis in 'head' are on or after 'target'. */ - private void advanceHead(int target) throws IOException { + private DisiWrapper advanceHead(int target) throws IOException { DisiWrapper headTop = head.top(); while (headTop != null && headTop.doc < target) { final DisiWrapper evicted = insertTailWithOverFlow(headTop); @@ -367,6 +375,7 @@ final class WANDScorer extends Scorer { headTop = head.top(); } } + return headTop; } private void advanceTail(DisiWrapper disi) throws IOException { @@ -437,7 +446,7 @@ final class WANDScorer extends Scorer { * Update {@code upTo} and maximum scores of sub scorers so that {@code upTo} is greater than or * equal to the next candidate after {@code target}, i.e. the top of `head`. */ - private void updateMaxScoresIfNecessary(int target) throws IOException { + private void moveToNextBlock(int target) throws IOException { assert lead == null; while (upTo < DocIdSetIterator.NO_MORE_DOCS) { @@ -467,48 +476,19 @@ final class WANDScorer extends Scorer { * Set 'doc' to the next potential match, and move all disis of 'head' that are on this doc into * 'lead'. */ - private void moveToNextCandidate(int target) throws IOException { - if (scoreMode == ScoreMode.TOP_SCORES) { - // Update score bounds if necessary so - updateMaxScoresIfNecessary(target); - assert upTo >= target; - - // updateMaxScores tries to move forward until a block with matches is found - // so if the head is empty it means there are no matches at all anymore - if (head.size() == 0) { - assert upTo == DocIdSetIterator.NO_MORE_DOCS; - doc = DocIdSetIterator.NO_MORE_DOCS; - return; - } - } - + private void moveToNextCandidate() throws IOException { // The top of `head` defines the next potential match // pop all documents which are on this doc lead = head.pop(); + assert doc == lead.doc; lead.next = null; leadMaxScore = lead.scaledMaxScore; freq = 1; - doc = lead.doc; while (head.size() > 0 && head.top().doc == doc) { addLead(head.pop()); } } - /** Move iterators to the tail until there is a potential match. */ - private int doNextCompetitiveCandidate() throws IOException { - while (leadMaxScore + tailMaxScore < minCompetitiveScore || freq + tailSize < minShouldMatch) { - // no match on doc is possible, move to the next potential match - pushBackLeads(doc + 1); - moveToNextCandidate(doc + 1); - assert ensureConsistent(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - break; - } - } - - return doc; - } - /** Advance all entries from the tail to know about all matches on the current doc. */ private void advanceAllTail() throws IOException { // we return the next doc when the sum of the scores of the potential