From 40a4330b146bcc9b7c3413f870dc7d3f83d52da9 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 16 Feb 2015 18:27:25 +0000 Subject: [PATCH] LUCENE-6244: DisjunctionScorer propagates two-phase iterators of its sub scorers. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1660180 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../apache/lucene/search/BooleanWeight.java | 9 +- .../lucene/search/DisjunctionMaxQuery.java | 6 +- .../lucene/search/DisjunctionMaxScorer.java | 7 +- .../lucene/search/DisjunctionScorer.java | 275 ++++++++++++------ .../lucene/search/DisjunctionSumScorer.java | 9 +- .../lucene/search/ScorerPriorityQueue.java | 13 + .../TestApproximationSearchEquivalence.java | 169 +++++++++++ .../lucene/search/TestBooleanQuery.java | 31 +- .../search/RandomApproximationQuery.java | 227 +++++++++++++++ .../search/SearchEquivalenceTestBase.java | 29 ++ 11 files changed, 670 insertions(+), 108 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestApproximationSearchEquivalence.java create mode 100644 lucene/test-framework/src/java/org/apache/lucene/search/RandomApproximationQuery.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9af35336b0f..4ca6a3a0338 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -86,6 +86,9 @@ Optimizations positions lazily if the phrase query is in a conjunction with other queries. (Robert Muir, Adrien Grand) +* LUCENE-6244: Pure disjunctions now propagate two-phase iterators of the + wrapped scorers (see LUCENE-6198). (Adrien Grand, Robert Muir) + * LUCENE-6241: FSDirectory.listAll() doesnt filter out subdirectories anymore, for faster performance. Subdirectories don't matter to Lucene. 
If you need to filter out non-index files with some custom usage, you may want to look at diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java index 2c5d6105098..6ad7397fced 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java @@ -376,10 +376,7 @@ public class BooleanWeight extends Weight { } else { float coords[] = new float[prohibited.size()+1]; Arrays.fill(coords, 1F); - return new ReqExclScorer(main, - new DisjunctionSumScorer(this, - prohibited.toArray(new Scorer[prohibited.size()]), - coords)); + return new ReqExclScorer(main, new DisjunctionSumScorer(this, prohibited, coords, false)); } } @@ -402,9 +399,7 @@ public class BooleanWeight extends Weight { if (minShouldMatch > 1) { return new MinShouldMatchSumScorer(this, optional, minShouldMatch, coords); } else { - return new DisjunctionSumScorer(this, - optional.toArray(new Scorer[optional.size()]), - coords); + return new DisjunctionSumScorer(this, optional, coords, needsScores); } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java index cfa19add59f..9a8cec02e16 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java @@ -115,7 +115,8 @@ public class DisjunctionMaxQuery extends Query implements Iterable { protected class DisjunctionMaxWeight extends Weight { /** The Weights for our subqueries, in 1-1 correspondence with disjuncts */ - protected ArrayList weights = new ArrayList<>(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts + protected final ArrayList weights = new ArrayList<>(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts + private final boolean 
needsScores; /** Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */ public DisjunctionMaxWeight(IndexSearcher searcher, boolean needsScores) throws IOException { @@ -123,6 +124,7 @@ public class DisjunctionMaxQuery extends Query implements Iterable { for (Query disjunctQuery : disjuncts) { weights.add(disjunctQuery.createWeight(searcher, needsScores)); } + this.needsScores = needsScores; } /** Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */ @@ -166,7 +168,7 @@ public class DisjunctionMaxQuery extends Query implements Iterable { // only one sub-scorer in this segment return scorers.get(0); } else { - return new DisjunctionMaxScorer(this, tieBreakerMultiplier, scorers.toArray(new Scorer[scorers.size()])); + return new DisjunctionMaxScorer(this, tieBreakerMultiplier, scorers, needsScores); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java index d36894769e8..e3e7ed74f03 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.List; import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper; @@ -41,13 +42,13 @@ final class DisjunctionMaxScorer extends DisjunctionScorer { * @param subScorers * The sub scorers this Scorer should iterate on */ - DisjunctionMaxScorer(Weight weight, float tieBreakerMultiplier, Scorer[] subScorers) { - super(weight, subScorers); + DisjunctionMaxScorer(Weight weight, float tieBreakerMultiplier, List subScorers, boolean needsScores) { + super(weight, subScorers, needsScores); this.tieBreakerMultiplier = tieBreakerMultiplier; } @Override - protected float score(ScorerWrapper topList, int freq) throws IOException { + protected 
float score(ScorerWrapper topList) throws IOException { float scoreSum = 0; float scoreMax = 0; for (ScorerWrapper w = topList; w != null; w = w.next) { diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java index 4955f332ec0..f6e812d81c8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScorer.java @@ -20,6 +20,7 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.List; import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper; import org.apache.lucene.util.BytesRef; @@ -29,27 +30,199 @@ import org.apache.lucene.util.BytesRef; */ abstract class DisjunctionScorer extends Scorer { + private final boolean needsScores; private final ScorerPriorityQueue subScorers; + private final long cost; - /** The document number of the current match. */ - protected int doc = -1; - protected int numScorers; - /** Number of matching scorers for the current match. 
*/ - private int freq = -1; /** Linked list of scorers which are on the current doc */ private ScorerWrapper topScorers; - protected DisjunctionScorer(Weight weight, Scorer subScorers[]) { + protected DisjunctionScorer(Weight weight, List subScorers, boolean needsScores) { super(weight); - if (subScorers.length <= 1) { + if (subScorers.size() <= 1) { throw new IllegalArgumentException("There must be at least 2 subScorers"); } - this.subScorers = new ScorerPriorityQueue(subScorers.length); + this.subScorers = new ScorerPriorityQueue(subScorers.size()); + long cost = 0; for (Scorer scorer : subScorers) { - this.subScorers.add(new ScorerWrapper(scorer)); + final ScorerWrapper w = new ScorerWrapper(scorer); + cost += w.cost; + this.subScorers.add(w); + } + this.cost = cost; + this.needsScores = needsScores; + } + + /** + * A {@link DocIdSetIterator} which is a disjunction of the approximations of + * the provided iterators. + */ + private static class DisjunctionDISIApproximation extends DocIdSetIterator { + + final ScorerPriorityQueue subScorers; + final long cost; + + DisjunctionDISIApproximation(ScorerPriorityQueue subScorers) { + this.subScorers = subScorers; + long cost = 0; + for (ScorerWrapper w : subScorers) { + cost += w.cost; + } + this.cost = cost; + } + + @Override + public long cost() { + return cost; + } + + @Override + public int docID() { + return subScorers.top().doc; + } + + @Override + public int nextDoc() throws IOException { + ScorerWrapper top = subScorers.top(); + final int doc = top.doc; + do { + top.doc = top.approximation.nextDoc(); + top = subScorers.updateTop(); + } while (top.doc == doc); + + return top.doc; + } + + @Override + public int advance(int target) throws IOException { + ScorerWrapper top = subScorers.top(); + do { + top.doc = top.approximation.advance(target); + top = subScorers.updateTop(); + } while (top.doc < target); + + return top.doc; } } - + + @Override + public TwoPhaseDocIdSetIterator asTwoPhaseIterator() { + boolean 
hasApproximation = false; + for (ScorerWrapper w : subScorers) { + if (w.twoPhaseView != null) { + hasApproximation = true; + break; + } + } + + if (hasApproximation == false) { + // none of the sub scorers supports approximations + return null; + } + + return new TwoPhaseDocIdSetIterator() { + + @Override + public DocIdSetIterator approximation() { + // note it is important to share the same pq as this scorer so that + // rebalancing the pq through the approximation will also rebalance + // the pq in this scorer. + return new DisjunctionDISIApproximation(subScorers); + } + + @Override + public boolean matches() throws IOException { + ScorerWrapper topScorers = subScorers.topList(); + // remove the head of the list as long as it does not match + while (topScorers.twoPhaseView != null && topScorers.twoPhaseView.matches() == false) { + topScorers = topScorers.next; + if (topScorers == null) { + return false; + } + } + // now we know we have at least one match since the first element of 'topScorers' matches + if (needsScores) { + // if scores or freqs are needed, we also need to remove scorers + // from the top list that do not actually match + ScorerWrapper previous = topScorers; + for (ScorerWrapper w = topScorers.next; w != null; w = w.next) { + if (w.twoPhaseView != null && w.twoPhaseView.matches() == false) { + // w does not match, remove it + previous.next = w.next; + } else { + previous = w; + } + } + + // We need to explicitly set the list of top scorers to avoid the + // laziness of DisjunctionScorer.score() that would take all scorers + // positioned on the same doc as the top of the pq, including + // non-matching scorers + DisjunctionScorer.this.topScorers = topScorers; + } + return true; + } + }; + } + + @Override + public final long cost() { + return cost; + } + + @Override + public final int docID() { + return subScorers.top().doc; + } + + @Override + public final int nextDoc() throws IOException { + topScorers = null; + ScorerWrapper top =
subScorers.top(); + final int doc = top.doc; + do { + top.doc = top.scorer.nextDoc(); + top = subScorers.updateTop(); + } while (top.doc == doc); + + return top.doc; + } + + @Override + public final int advance(int target) throws IOException { + topScorers = null; + ScorerWrapper top = subScorers.top(); + do { + top.doc = top.scorer.advance(target); + top = subScorers.updateTop(); + } while (top.doc < target); + + return top.doc; + } + + @Override + public final int freq() throws IOException { + if (topScorers == null) { + topScorers = subScorers.topList(); + } + int freq = 1; + for (ScorerWrapper w = topScorers.next; w != null; w = w.next) { + freq += 1; + } + return freq; + } + + @Override + public final float score() throws IOException { + if (topScorers == null) { + topScorers = subScorers.topList(); + } + return score(topScorers); + } + + /** Compute the score for the given linked list of scorers. */ + protected abstract float score(ScorerWrapper topList) throws IOException; + @Override public final Collection getChildren() { ArrayList children = new ArrayList<>(); @@ -78,86 +251,4 @@ abstract class DisjunctionScorer extends Scorer { public BytesRef getPayload() throws IOException { return null; } - - @Override - public final long cost() { - long sum = 0; - for (ScorerWrapper scorer : subScorers) { - sum += scorer.cost; - } - return sum; - } - - @Override - public final int docID() { - return doc; - } - - @Override - public final int nextDoc() throws IOException { - assert doc != NO_MORE_DOCS; - - ScorerWrapper top = subScorers.top(); - final int doc = this.doc; - while (top.doc == doc) { - top.doc = top.scorer.nextDoc(); - if (top.doc == NO_MORE_DOCS) { - subScorers.pop(); - if (subScorers.size() == 0) { - return this.doc = NO_MORE_DOCS; - } - top = subScorers.top(); - } else { - top = subScorers.updateTop(); - } - } - - freq = -1; - return this.doc = top.doc; - } - - @Override - public final int advance(int target) throws IOException { - assert doc != 
NO_MORE_DOCS; - - ScorerWrapper top = subScorers.top(); - while (top.doc < target) { - top.doc = top.scorer.advance(target); - if (top.doc == NO_MORE_DOCS) { - subScorers.pop(); - if (subScorers.size() == 0) { - return this.doc = NO_MORE_DOCS; - } - top = subScorers.top(); - } else { - top = subScorers.updateTop(); - } - } - - freq = -1; - return this.doc = top.doc; - } - - @Override - public final int freq() throws IOException { - if (freq < 0) { - topScorers = subScorers.topList(); - int freq = 1; - for (ScorerWrapper w = topScorers.next; w != null; w = w.next) { - freq += 1; - } - this.freq = freq; - } - return freq; - } - - @Override - public final float score() throws IOException { - final int freq = freq(); // compute the top scorers if necessary - return score(topScorers, freq); - } - - /** Compute the score for the given linked list of scorers. */ - protected abstract float score(ScorerWrapper topList, int freq) throws IOException; - } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java index 185e012e6ba..d6b25b15568 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.List; import org.apache.lucene.search.ScorerPriorityQueue.ScorerWrapper; @@ -32,16 +33,18 @@ final class DisjunctionSumScorer extends DisjunctionScorer { * @param subScorers Array of at least two subscorers. 
* @param coord Table of coordination factors */ - DisjunctionSumScorer(Weight weight, Scorer[] subScorers, float[] coord) { - super(weight, subScorers); + DisjunctionSumScorer(Weight weight, List subScorers, float[] coord, boolean needsScores) { + super(weight, subScorers, needsScores); this.coord = coord; } @Override - protected float score(ScorerWrapper topList, int freq) throws IOException { + protected float score(ScorerWrapper topList) throws IOException { double score = 0; + int freq = 0; for (ScorerWrapper w = topList; w != null; w = w.next) { score += w.scorer.score(); + freq += 1; } return (float)score * coord[freq]; } diff --git a/lucene/core/src/java/org/apache/lucene/search/ScorerPriorityQueue.java b/lucene/core/src/java/org/apache/lucene/search/ScorerPriorityQueue.java index 44379742052..c6a4d0adecd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScorerPriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScorerPriorityQueue.java @@ -35,10 +35,23 @@ final class ScorerPriorityQueue implements Iterable