mirror of https://github.com/apache/lucene.git
LUCENE-6815: Make DisjunctionScorer advance lazily.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1719470 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8575e3fb3a
commit
1967050202
|
@ -136,6 +136,10 @@ Optimizations
|
|||
* LUCENE-6912: Grouping's Collectors now calculate a response to needsScores()
|
||||
instead of always 'true'. (David Smiley)
|
||||
|
||||
* LUCENE-6815: DisjunctionScorer now advances two-phased iterators lazily,
|
||||
stopping to evaluate them as soon as a single one matches. The other iterators
|
||||
will be confirmed lazily when computing score() or freq(). (Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-6918: LRUQueryCache.onDocIdSetEviction is only called when at least
|
||||
|
|
|
@ -27,6 +27,7 @@ public class DisiWrapper {
|
|||
public final DocIdSetIterator iterator;
|
||||
public final Scorer scorer;
|
||||
public final long cost;
|
||||
public final float matchCost; // the match cost for two-phase iterators, 0 otherwise
|
||||
public int doc; // the current doc, used for comparison
|
||||
public DisiWrapper next; // reference to a next element, see #topList
|
||||
|
||||
|
@ -52,8 +53,10 @@ public class DisiWrapper {
|
|||
|
||||
if (twoPhaseView != null) {
|
||||
approximation = twoPhaseView.approximation();
|
||||
matchCost = twoPhaseView.matchCost();
|
||||
} else {
|
||||
approximation = iterator;
|
||||
matchCost = 0f;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -67,8 +70,10 @@ public class DisiWrapper {
|
|||
|
||||
if (twoPhaseView != null) {
|
||||
approximation = twoPhaseView.approximation();
|
||||
matchCost = twoPhaseView.matchCost();
|
||||
} else {
|
||||
approximation = iterator;
|
||||
matchCost = 0f;
|
||||
}
|
||||
this.lastApproxNonMatchDoc = -2;
|
||||
this.lastApproxMatchDoc = -2;
|
||||
|
|
|
@ -22,17 +22,18 @@ import java.util.ArrayList;
|
|||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* Base class for Scorers that score disjunctions.
|
||||
*/
|
||||
abstract class DisjunctionScorer extends Scorer {
|
||||
|
||||
private final boolean needsScores;
|
||||
final DisiPriorityQueue subScorers;
|
||||
private final long cost;
|
||||
|
||||
/** Linked list of scorers which are on the current doc */
|
||||
private DisiWrapper topScorers;
|
||||
private final DisiPriorityQueue subScorers;
|
||||
private final DisjunctionDISIApproximation approximation;
|
||||
private final TwoPhase twoPhase;
|
||||
|
||||
protected DisjunctionScorer(Weight weight, List<Scorer> subScorers, boolean needsScores) {
|
||||
super(weight);
|
||||
|
@ -40,125 +41,125 @@ abstract class DisjunctionScorer extends Scorer {
|
|||
throw new IllegalArgumentException("There must be at least 2 subScorers");
|
||||
}
|
||||
this.subScorers = new DisiPriorityQueue(subScorers.size());
|
||||
long cost = 0;
|
||||
for (Scorer scorer : subScorers) {
|
||||
final DisiWrapper w = new DisiWrapper(scorer);
|
||||
cost += w.cost;
|
||||
this.subScorers.add(w);
|
||||
}
|
||||
this.cost = cost;
|
||||
this.needsScores = needsScores;
|
||||
this.approximation = new DisjunctionDISIApproximation(this.subScorers);
|
||||
|
||||
boolean hasApproximation = false;
|
||||
float sumMatchCost = 0;
|
||||
long sumApproxCost = 0;
|
||||
// Compute matchCost as the average over the matchCost of the subScorers.
|
||||
// This is weighted by the cost, which is an expected number of matching documents.
|
||||
for (DisiWrapper w : this.subScorers) {
|
||||
long costWeight = (w.cost <= 1) ? 1 : w.cost;
|
||||
sumApproxCost += costWeight;
|
||||
if (w.twoPhaseView != null) {
|
||||
hasApproximation = true;
|
||||
sumMatchCost += w.matchCost * costWeight;
|
||||
}
|
||||
}
|
||||
|
||||
if (hasApproximation == false) { // no sub scorer supports approximations
|
||||
twoPhase = null;
|
||||
} else {
|
||||
final float matchCost = sumMatchCost / sumApproxCost;
|
||||
twoPhase = new TwoPhase(approximation, matchCost);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSetIterator iterator() {
|
||||
return new DocIdSetIterator() {
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return subScorers.top().doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int nextDoc() throws IOException {
|
||||
topScorers = null;
|
||||
DisiWrapper top = subScorers.top();
|
||||
final int doc = top.doc;
|
||||
do {
|
||||
top.doc = top.iterator.nextDoc();
|
||||
top = subScorers.updateTop();
|
||||
} while (top.doc == doc);
|
||||
|
||||
return top.doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int advance(int target) throws IOException {
|
||||
topScorers = null;
|
||||
DisiWrapper top = subScorers.top();
|
||||
do {
|
||||
top.doc = top.iterator.advance(target);
|
||||
top = subScorers.updateTop();
|
||||
} while (top.doc < target);
|
||||
|
||||
return top.doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long cost() {
|
||||
return cost;
|
||||
}
|
||||
|
||||
};
|
||||
if (twoPhase != null) {
|
||||
return TwoPhaseIterator.asDocIdSetIterator(twoPhase);
|
||||
} else {
|
||||
return approximation;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TwoPhaseIterator twoPhaseIterator() {
|
||||
float sumMatchCost = 0;
|
||||
long sumApproxCost = 0;
|
||||
return twoPhase;
|
||||
}
|
||||
|
||||
// Compute matchCost as the avarage over the matchCost of the subScorers.
|
||||
// This is weighted by the cost, which is an expected number of matching documents.
|
||||
for (DisiWrapper w : subScorers) {
|
||||
if (w.twoPhaseView != null) {
|
||||
long costWeight = (w.cost <= 1) ? 1 : w.cost;
|
||||
sumMatchCost += w.twoPhaseView.matchCost() * costWeight;
|
||||
sumApproxCost += costWeight;
|
||||
}
|
||||
}
|
||||
private class TwoPhase extends TwoPhaseIterator {
|
||||
|
||||
if (sumApproxCost == 0) { // no sub scorer supports approximations
|
||||
return null;
|
||||
}
|
||||
private final float matchCost;
|
||||
// list of verified matches on the current doc
|
||||
DisiWrapper verifiedMatches;
|
||||
// priority queue of approximations on the current doc that have not been verified yet
|
||||
final PriorityQueue<DisiWrapper> unverifiedMatches;
|
||||
|
||||
final float matchCost = sumMatchCost / sumApproxCost;
|
||||
|
||||
// note it is important to share the same pq as this scorer so that
|
||||
// rebalancing the pq through the approximation will also rebalance
|
||||
// the pq in this scorer.
|
||||
return new TwoPhaseIterator(new DisjunctionDISIApproximation(subScorers)) {
|
||||
|
||||
@Override
|
||||
public boolean matches() throws IOException {
|
||||
DisiWrapper topScorers = subScorers.topList();
|
||||
// remove the head of the list as long as it does not match
|
||||
while (topScorers.twoPhaseView != null && ! topScorers.twoPhaseView.matches()) {
|
||||
topScorers = topScorers.next;
|
||||
if (topScorers == null) {
|
||||
return false;
|
||||
}
|
||||
private TwoPhase(DocIdSetIterator approximation, float matchCost) {
|
||||
super(approximation);
|
||||
this.matchCost = matchCost;
|
||||
unverifiedMatches = new PriorityQueue<DisiWrapper>(DisjunctionScorer.this.subScorers.size()) {
|
||||
@Override
|
||||
protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
|
||||
return a.matchCost < b.matchCost;
|
||||
}
|
||||
// now we know we have at least one match since the first element of 'matchList' matches
|
||||
if (needsScores) {
|
||||
// if scores or freqs are needed, we also need to remove scorers
|
||||
// from the top list that do not actually match
|
||||
DisiWrapper previous = topScorers;
|
||||
for (DisiWrapper w = topScorers.next; w != null; w = w.next) {
|
||||
if (w.twoPhaseView != null && ! w.twoPhaseView.matches()) {
|
||||
// w does not match, remove it
|
||||
previous.next = w.next;
|
||||
} else {
|
||||
previous = w;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
DisiWrapper getSubMatches() throws IOException {
|
||||
// iteration order does not matter
|
||||
for (DisiWrapper w : unverifiedMatches) {
|
||||
if (w.twoPhaseView.matches()) {
|
||||
w.next = verifiedMatches;
|
||||
verifiedMatches = w;
|
||||
}
|
||||
}
|
||||
unverifiedMatches.clear();
|
||||
return verifiedMatches;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches() throws IOException {
|
||||
verifiedMatches = null;
|
||||
unverifiedMatches.clear();
|
||||
|
||||
for (DisiWrapper w = subScorers.topList(); w != null; ) {
|
||||
DisiWrapper next = w.next;
|
||||
|
||||
if (w.twoPhaseView == null) {
|
||||
// implicitly verified, move it to verifiedMatches
|
||||
w.next = verifiedMatches;
|
||||
verifiedMatches = w;
|
||||
|
||||
if (needsScores == false) {
|
||||
// we can stop here
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
// since we don't need scores, let's pretend we have a single match
|
||||
topScorers.next = null;
|
||||
unverifiedMatches.add(w);
|
||||
}
|
||||
|
||||
// We need to explicitely set the list of top scorers to avoid the
|
||||
// laziness of DisjunctionScorer.score() that would take all scorers
|
||||
// positioned on the same doc as the top of the pq, including
|
||||
// non-matching scorers
|
||||
DisjunctionScorer.this.topScorers = topScorers;
|
||||
w = next;
|
||||
}
|
||||
|
||||
if (verifiedMatches != null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float matchCost() {
|
||||
return matchCost;
|
||||
|
||||
// verify subs that have an two-phase iterator
|
||||
// least-costly ones first
|
||||
while (unverifiedMatches.size() > 0) {
|
||||
DisiWrapper w = unverifiedMatches.pop();
|
||||
if (w.twoPhaseView.matches()) {
|
||||
w.next = null;
|
||||
verifiedMatches = w;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float matchCost() {
|
||||
return matchCost;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -166,13 +167,19 @@ abstract class DisjunctionScorer extends Scorer {
|
|||
return subScorers.top().doc;
|
||||
}
|
||||
|
||||
DisiWrapper getSubMatches() throws IOException {
|
||||
if (twoPhase == null) {
|
||||
return subScorers.topList();
|
||||
} else {
|
||||
return twoPhase.getSubMatches();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int freq() throws IOException {
|
||||
if (topScorers == null) {
|
||||
topScorers = subScorers.topList();
|
||||
}
|
||||
DisiWrapper subMatches = getSubMatches();
|
||||
int freq = 1;
|
||||
for (DisiWrapper w = topScorers.next; w != null; w = w.next) {
|
||||
for (DisiWrapper w = subMatches.next; w != null; w = w.next) {
|
||||
freq += 1;
|
||||
}
|
||||
return freq;
|
||||
|
@ -180,10 +187,7 @@ abstract class DisjunctionScorer extends Scorer {
|
|||
|
||||
@Override
|
||||
public final float score() throws IOException {
|
||||
if (topScorers == null) {
|
||||
topScorers = subScorers.topList();
|
||||
}
|
||||
return score(topScorers);
|
||||
return score(getSubMatches());
|
||||
}
|
||||
|
||||
/** Compute the score for the given linked list of scorers. */
|
||||
|
|
|
@ -167,7 +167,7 @@ public final class SynonymQuery extends Query {
|
|||
final float freq;
|
||||
if (scorer instanceof SynonymScorer) {
|
||||
SynonymScorer synScorer = (SynonymScorer) scorer;
|
||||
freq = synScorer.tf(synScorer.subScorers.topList());
|
||||
freq = synScorer.tf(synScorer.getSubMatches());
|
||||
} else {
|
||||
assert scorer instanceof TermScorer;
|
||||
freq = scorer.freq();
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -27,10 +30,12 @@ package org.apache.lucene.util;
|
|||
* <code>maxSize+1</code> if instantiated via the
|
||||
* {@link #PriorityQueue(int,boolean)} constructor with <code>prepopulate</code>
|
||||
* set to <code>true</code>.
|
||||
*
|
||||
*
|
||||
* <b>NOTE</b>: Iteration order is not specified.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public abstract class PriorityQueue<T> {
|
||||
public abstract class PriorityQueue<T> implements Iterable<T> {
|
||||
private int size = 0;
|
||||
private final int maxSize;
|
||||
private final T[] heap;
|
||||
|
@ -58,7 +63,7 @@ public abstract class PriorityQueue<T> {
|
|||
@SuppressWarnings("unchecked") final T[] h = (T[]) new Object[heapSize];
|
||||
this.heap = h;
|
||||
this.maxSize = maxSize;
|
||||
|
||||
|
||||
if (prepopulate) {
|
||||
// If sentinel objects are supported, populate the queue with them
|
||||
T sentinel = getSentinelObject();
|
||||
|
@ -80,41 +85,41 @@ public abstract class PriorityQueue<T> {
|
|||
|
||||
/**
|
||||
* This method can be overridden by extending classes to return a sentinel
|
||||
* object which will be used by the {@link PriorityQueue#PriorityQueue(int,boolean)}
|
||||
* object which will be used by the {@link PriorityQueue#PriorityQueue(int,boolean)}
|
||||
* constructor to fill the queue, so that the code which uses that queue can always
|
||||
* assume it's full and only change the top without attempting to insert any new
|
||||
* object.<br>
|
||||
*
|
||||
*
|
||||
* Those sentinel values should always compare worse than any non-sentinel
|
||||
* value (i.e., {@link #lessThan} should always favor the
|
||||
* non-sentinel values).<br>
|
||||
*
|
||||
*
|
||||
* By default, this method returns null, which means the queue will not be
|
||||
* filled with sentinel values. Otherwise, the value returned will be used to
|
||||
* pre-populate the queue. Adds sentinel values to the queue.<br>
|
||||
*
|
||||
*
|
||||
* If this method is extended to return a non-null value, then the following
|
||||
* usage pattern is recommended:
|
||||
*
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* // extends getSentinelObject() to return a non-null value.
|
||||
* PriorityQueue<MyObject> pq = new MyQueue<MyObject>(numHits);
|
||||
* // save the 'top' element, which is guaranteed to not be null.
|
||||
* MyObject pqTop = pq.top();
|
||||
* <...>
|
||||
* // now in order to add a new element, which is 'better' than top (after
|
||||
* // now in order to add a new element, which is 'better' than top (after
|
||||
* // you've verified it is better), it is as simple as:
|
||||
* pqTop.change().
|
||||
* pqTop = pq.updateTop();
|
||||
* </pre>
|
||||
*
|
||||
*
|
||||
* <b>NOTE:</b> if this method returns a non-null value, it will be called by
|
||||
* the {@link PriorityQueue#PriorityQueue(int,boolean)} constructor
|
||||
* the {@link PriorityQueue#PriorityQueue(int,boolean)} constructor
|
||||
* {@link #size()} times, relying on a new object to be returned and will not
|
||||
* check if it's null again. Therefore you should ensure any call to this
|
||||
* method creates a new instance and behaves consistently, e.g., it cannot
|
||||
* return null if it previously returned non-null.
|
||||
*
|
||||
*
|
||||
* @return the sentinel object to use to pre-populate the queue, or null if
|
||||
* sentinel objects are not supported.
|
||||
*/
|
||||
|
@ -126,7 +131,7 @@ public abstract class PriorityQueue<T> {
|
|||
* Adds an Object to a PriorityQueue in log(size) time. If one tries to add
|
||||
* more objects than maxSize from initialize an
|
||||
* {@link ArrayIndexOutOfBoundsException} is thrown.
|
||||
*
|
||||
*
|
||||
* @return the new 'top' element in the queue.
|
||||
*/
|
||||
public final T add(T element) {
|
||||
|
@ -182,24 +187,24 @@ public abstract class PriorityQueue<T> {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Should be called when the Object at top changes values. Still log(n) worst
|
||||
* case, but it's at least twice as fast to
|
||||
*
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* pq.top().change();
|
||||
* pq.updateTop();
|
||||
* </pre>
|
||||
*
|
||||
*
|
||||
* instead of
|
||||
*
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* o = pq.pop();
|
||||
* o.change();
|
||||
* pq.push(o);
|
||||
* </pre>
|
||||
*
|
||||
*
|
||||
* @return the new 'top' element.
|
||||
*/
|
||||
public final T updateTop() {
|
||||
|
@ -263,7 +268,7 @@ public abstract class PriorityQueue<T> {
|
|||
heap[i] = node; // install saved node
|
||||
return i != origPos;
|
||||
}
|
||||
|
||||
|
||||
private final void downHeap(int i) {
|
||||
T node = heap[i]; // save top node
|
||||
int j = i << 1; // find smaller child
|
||||
|
@ -289,4 +294,26 @@ public abstract class PriorityQueue<T> {
|
|||
protected final Object[] getHeapArray() {
|
||||
return (Object[]) heap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<T> iterator() {
|
||||
return new Iterator<T>() {
|
||||
|
||||
int i = 1;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i <= size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
if (hasNext() == false) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
return heap[i++];
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,9 @@ package org.apache.lucene.util;
|
|||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Random;
|
||||
|
||||
public class TestPriorityQueue extends LuceneTestCase {
|
||||
|
@ -188,4 +191,65 @@ public class TestPriorityQueue extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testIterator() {
|
||||
IntegerQueue queue = new IntegerQueue(3);
|
||||
|
||||
Iterator<Integer> it = queue.iterator();
|
||||
assertFalse(it.hasNext());
|
||||
try {
|
||||
it.next();
|
||||
fail();
|
||||
} catch (NoSuchElementException e) {
|
||||
// ok
|
||||
}
|
||||
|
||||
queue.add(1);
|
||||
it = queue.iterator();
|
||||
assertTrue(it.hasNext());
|
||||
assertEquals(Integer.valueOf(1), it.next());
|
||||
assertFalse(it.hasNext());
|
||||
try {
|
||||
it.next();
|
||||
fail();
|
||||
} catch (NoSuchElementException e) {
|
||||
// ok
|
||||
}
|
||||
|
||||
queue.add(2);
|
||||
it = queue.iterator();
|
||||
assertTrue(it.hasNext());
|
||||
assertEquals(Integer.valueOf(1), it.next());
|
||||
assertTrue(it.hasNext());
|
||||
assertEquals(Integer.valueOf(2), it.next());
|
||||
assertFalse(it.hasNext());
|
||||
try {
|
||||
it.next();
|
||||
fail();
|
||||
} catch (NoSuchElementException e) {
|
||||
// ok
|
||||
}
|
||||
}
|
||||
|
||||
public void testIteratorRandom() {
|
||||
final int maxSize = TestUtil.nextInt(random(), 1, 20);
|
||||
IntegerQueue queue = new IntegerQueue(maxSize);
|
||||
final int iters = atLeast(100);
|
||||
final List<Integer> expected = new ArrayList<>();
|
||||
for (int iter = 0; iter < iters; ++iter) {
|
||||
if (queue.size() == 0 || (queue.size() < maxSize && random().nextBoolean())) {
|
||||
final Integer value = new Integer(random().nextInt(10));
|
||||
queue.add(value);
|
||||
expected.add(value);
|
||||
} else {
|
||||
expected.remove(queue.pop());
|
||||
}
|
||||
List<Integer> actual = new ArrayList<>();
|
||||
for (Integer value : queue) {
|
||||
actual.add(value);
|
||||
}
|
||||
CollectionUtil.introSort(expected);
|
||||
CollectionUtil.introSort(actual);
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue