LUCENE-6815: Make DisjunctionScorer advance lazily.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1719470 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2015-12-11 16:30:10 +00:00
parent 8575e3fb3a
commit 1967050202
6 changed files with 232 additions and 128 deletions

View File

@ -136,6 +136,10 @@ Optimizations
* LUCENE-6912: Grouping's Collectors now calculate a response to needsScores()
instead of always 'true'. (David Smiley)
* LUCENE-6815: DisjunctionScorer now advances two-phased iterators lazily,
stopping to evaluate them as soon as a single one matches. The other iterators
will be confirmed lazily when computing score() or freq(). (Adrien Grand)
Bug Fixes
* LUCENE-6918: LRUQueryCache.onDocIdSetEviction is only called when at least

View File

@ -27,6 +27,7 @@ public class DisiWrapper {
public final DocIdSetIterator iterator;
public final Scorer scorer;
public final long cost;
public final float matchCost; // the match cost for two-phase iterators, 0 otherwise
public int doc; // the current doc, used for comparison
public DisiWrapper next; // reference to a next element, see #topList
@ -52,8 +53,10 @@ public class DisiWrapper {
if (twoPhaseView != null) {
approximation = twoPhaseView.approximation();
matchCost = twoPhaseView.matchCost();
} else {
approximation = iterator;
matchCost = 0f;
}
}
@ -67,8 +70,10 @@ public class DisiWrapper {
if (twoPhaseView != null) {
approximation = twoPhaseView.approximation();
matchCost = twoPhaseView.matchCost();
} else {
approximation = iterator;
matchCost = 0f;
}
this.lastApproxNonMatchDoc = -2;
this.lastApproxMatchDoc = -2;

View File

@ -22,17 +22,18 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.util.PriorityQueue;
/**
* Base class for Scorers that score disjunctions.
*/
abstract class DisjunctionScorer extends Scorer {
private final boolean needsScores;
final DisiPriorityQueue subScorers;
private final long cost;
/** Linked list of scorers which are on the current doc */
private DisiWrapper topScorers;
private final DisiPriorityQueue subScorers;
private final DisjunctionDISIApproximation approximation;
private final TwoPhase twoPhase;
protected DisjunctionScorer(Weight weight, List<Scorer> subScorers, boolean needsScores) {
super(weight);
@ -40,125 +41,125 @@ abstract class DisjunctionScorer extends Scorer {
throw new IllegalArgumentException("There must be at least 2 subScorers");
}
this.subScorers = new DisiPriorityQueue(subScorers.size());
long cost = 0;
for (Scorer scorer : subScorers) {
final DisiWrapper w = new DisiWrapper(scorer);
cost += w.cost;
this.subScorers.add(w);
}
this.cost = cost;
this.needsScores = needsScores;
this.approximation = new DisjunctionDISIApproximation(this.subScorers);
boolean hasApproximation = false;
float sumMatchCost = 0;
long sumApproxCost = 0;
// Compute matchCost as the average over the matchCost of the subScorers.
// This is weighted by the cost, which is an expected number of matching documents.
for (DisiWrapper w : this.subScorers) {
long costWeight = (w.cost <= 1) ? 1 : w.cost;
sumApproxCost += costWeight;
if (w.twoPhaseView != null) {
hasApproximation = true;
sumMatchCost += w.matchCost * costWeight;
}
}
if (hasApproximation == false) { // no sub scorer supports approximations
twoPhase = null;
} else {
final float matchCost = sumMatchCost / sumApproxCost;
twoPhase = new TwoPhase(approximation, matchCost);
}
}
@Override
public DocIdSetIterator iterator() {
return new DocIdSetIterator() {
@Override
public int docID() {
return subScorers.top().doc;
}
@Override
public final int nextDoc() throws IOException {
topScorers = null;
DisiWrapper top = subScorers.top();
final int doc = top.doc;
do {
top.doc = top.iterator.nextDoc();
top = subScorers.updateTop();
} while (top.doc == doc);
return top.doc;
}
@Override
public final int advance(int target) throws IOException {
topScorers = null;
DisiWrapper top = subScorers.top();
do {
top.doc = top.iterator.advance(target);
top = subScorers.updateTop();
} while (top.doc < target);
return top.doc;
}
@Override
public final long cost() {
return cost;
}
};
if (twoPhase != null) {
return TwoPhaseIterator.asDocIdSetIterator(twoPhase);
} else {
return approximation;
}
}
@Override
public TwoPhaseIterator twoPhaseIterator() {
float sumMatchCost = 0;
long sumApproxCost = 0;
return twoPhase;
}
// Compute matchCost as the avarage over the matchCost of the subScorers.
// This is weighted by the cost, which is an expected number of matching documents.
for (DisiWrapper w : subScorers) {
if (w.twoPhaseView != null) {
long costWeight = (w.cost <= 1) ? 1 : w.cost;
sumMatchCost += w.twoPhaseView.matchCost() * costWeight;
sumApproxCost += costWeight;
}
}
private class TwoPhase extends TwoPhaseIterator {
if (sumApproxCost == 0) { // no sub scorer supports approximations
return null;
}
private final float matchCost;
// list of verified matches on the current doc
DisiWrapper verifiedMatches;
// priority queue of approximations on the current doc that have not been verified yet
final PriorityQueue<DisiWrapper> unverifiedMatches;
final float matchCost = sumMatchCost / sumApproxCost;
// note it is important to share the same pq as this scorer so that
// rebalancing the pq through the approximation will also rebalance
// the pq in this scorer.
return new TwoPhaseIterator(new DisjunctionDISIApproximation(subScorers)) {
@Override
public boolean matches() throws IOException {
DisiWrapper topScorers = subScorers.topList();
// remove the head of the list as long as it does not match
while (topScorers.twoPhaseView != null && ! topScorers.twoPhaseView.matches()) {
topScorers = topScorers.next;
if (topScorers == null) {
return false;
}
private TwoPhase(DocIdSetIterator approximation, float matchCost) {
super(approximation);
this.matchCost = matchCost;
unverifiedMatches = new PriorityQueue<DisiWrapper>(DisjunctionScorer.this.subScorers.size()) {
@Override
protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
return a.matchCost < b.matchCost;
}
// now we know we have at least one match since the first element of 'matchList' matches
if (needsScores) {
// if scores or freqs are needed, we also need to remove scorers
// from the top list that do not actually match
DisiWrapper previous = topScorers;
for (DisiWrapper w = topScorers.next; w != null; w = w.next) {
if (w.twoPhaseView != null && ! w.twoPhaseView.matches()) {
// w does not match, remove it
previous.next = w.next;
} else {
previous = w;
}
};
}
DisiWrapper getSubMatches() throws IOException {
// iteration order does not matter
for (DisiWrapper w : unverifiedMatches) {
if (w.twoPhaseView.matches()) {
w.next = verifiedMatches;
verifiedMatches = w;
}
}
unverifiedMatches.clear();
return verifiedMatches;
}
@Override
public boolean matches() throws IOException {
verifiedMatches = null;
unverifiedMatches.clear();
for (DisiWrapper w = subScorers.topList(); w != null; ) {
DisiWrapper next = w.next;
if (w.twoPhaseView == null) {
// implicitly verified, move it to verifiedMatches
w.next = verifiedMatches;
verifiedMatches = w;
if (needsScores == false) {
// we can stop here
return true;
}
} else {
// since we don't need scores, let's pretend we have a single match
topScorers.next = null;
unverifiedMatches.add(w);
}
// We need to explicitely set the list of top scorers to avoid the
// laziness of DisjunctionScorer.score() that would take all scorers
// positioned on the same doc as the top of the pq, including
// non-matching scorers
DisjunctionScorer.this.topScorers = topScorers;
w = next;
}
if (verifiedMatches != null) {
return true;
}
@Override
public float matchCost() {
return matchCost;
// verify subs that have an two-phase iterator
// least-costly ones first
while (unverifiedMatches.size() > 0) {
DisiWrapper w = unverifiedMatches.pop();
if (w.twoPhaseView.matches()) {
w.next = null;
verifiedMatches = w;
return true;
}
}
};
return false;
}
@Override
public float matchCost() {
return matchCost;
}
}
@Override
@ -166,13 +167,19 @@ abstract class DisjunctionScorer extends Scorer {
return subScorers.top().doc;
}
DisiWrapper getSubMatches() throws IOException {
if (twoPhase == null) {
return subScorers.topList();
} else {
return twoPhase.getSubMatches();
}
}
@Override
public final int freq() throws IOException {
if (topScorers == null) {
topScorers = subScorers.topList();
}
DisiWrapper subMatches = getSubMatches();
int freq = 1;
for (DisiWrapper w = topScorers.next; w != null; w = w.next) {
for (DisiWrapper w = subMatches.next; w != null; w = w.next) {
freq += 1;
}
return freq;
@ -180,10 +187,7 @@ abstract class DisjunctionScorer extends Scorer {
@Override
public final float score() throws IOException {
if (topScorers == null) {
topScorers = subScorers.topList();
}
return score(topScorers);
return score(getSubMatches());
}
/** Compute the score for the given linked list of scorers. */

View File

@ -167,7 +167,7 @@ public final class SynonymQuery extends Query {
final float freq;
if (scorer instanceof SynonymScorer) {
SynonymScorer synScorer = (SynonymScorer) scorer;
freq = synScorer.tf(synScorer.subScorers.topList());
freq = synScorer.tf(synScorer.getSubMatches());
} else {
assert scorer instanceof TermScorer;
freq = scorer.freq();

View File

@ -1,5 +1,8 @@
package org.apache.lucene.util;
import java.util.Iterator;
import java.util.NoSuchElementException;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -27,10 +30,12 @@ package org.apache.lucene.util;
* <code>maxSize+1</code> if instantiated via the
* {@link #PriorityQueue(int,boolean)} constructor with <code>prepopulate</code>
* set to <code>true</code>.
*
*
* <b>NOTE</b>: Iteration order is not specified.
*
* @lucene.internal
*/
public abstract class PriorityQueue<T> {
public abstract class PriorityQueue<T> implements Iterable<T> {
private int size = 0;
private final int maxSize;
private final T[] heap;
@ -58,7 +63,7 @@ public abstract class PriorityQueue<T> {
@SuppressWarnings("unchecked") final T[] h = (T[]) new Object[heapSize];
this.heap = h;
this.maxSize = maxSize;
if (prepopulate) {
// If sentinel objects are supported, populate the queue with them
T sentinel = getSentinelObject();
@ -80,41 +85,41 @@ public abstract class PriorityQueue<T> {
/**
* This method can be overridden by extending classes to return a sentinel
* object which will be used by the {@link PriorityQueue#PriorityQueue(int,boolean)}
* object which will be used by the {@link PriorityQueue#PriorityQueue(int,boolean)}
* constructor to fill the queue, so that the code which uses that queue can always
* assume it's full and only change the top without attempting to insert any new
* object.<br>
*
*
* Those sentinel values should always compare worse than any non-sentinel
* value (i.e., {@link #lessThan} should always favor the
* non-sentinel values).<br>
*
*
* By default, this method returns null, which means the queue will not be
* filled with sentinel values. Otherwise, the value returned will be used to
* pre-populate the queue. Adds sentinel values to the queue.<br>
*
*
* If this method is extended to return a non-null value, then the following
* usage pattern is recommended:
*
*
* <pre class="prettyprint">
* // extends getSentinelObject() to return a non-null value.
* PriorityQueue&lt;MyObject&gt; pq = new MyQueue&lt;MyObject&gt;(numHits);
* // save the 'top' element, which is guaranteed to not be null.
* MyObject pqTop = pq.top();
* &lt;...&gt;
* // now in order to add a new element, which is 'better' than top (after
* // now in order to add a new element, which is 'better' than top (after
* // you've verified it is better), it is as simple as:
* pqTop.change().
* pqTop = pq.updateTop();
* </pre>
*
*
* <b>NOTE:</b> if this method returns a non-null value, it will be called by
* the {@link PriorityQueue#PriorityQueue(int,boolean)} constructor
* the {@link PriorityQueue#PriorityQueue(int,boolean)} constructor
* {@link #size()} times, relying on a new object to be returned and will not
* check if it's null again. Therefore you should ensure any call to this
* method creates a new instance and behaves consistently, e.g., it cannot
* return null if it previously returned non-null.
*
*
* @return the sentinel object to use to pre-populate the queue, or null if
* sentinel objects are not supported.
*/
@ -126,7 +131,7 @@ public abstract class PriorityQueue<T> {
* Adds an Object to a PriorityQueue in log(size) time. If one tries to add
* more objects than maxSize from initialize an
* {@link ArrayIndexOutOfBoundsException} is thrown.
*
*
* @return the new 'top' element in the queue.
*/
public final T add(T element) {
@ -182,24 +187,24 @@ public abstract class PriorityQueue<T> {
return null;
}
}
/**
* Should be called when the Object at top changes values. Still log(n) worst
* case, but it's at least twice as fast to
*
*
* <pre class="prettyprint">
* pq.top().change();
* pq.updateTop();
* </pre>
*
*
* instead of
*
*
* <pre class="prettyprint">
* o = pq.pop();
* o.change();
* pq.push(o);
* </pre>
*
*
* @return the new 'top' element.
*/
public final T updateTop() {
@ -263,7 +268,7 @@ public abstract class PriorityQueue<T> {
heap[i] = node; // install saved node
return i != origPos;
}
private final void downHeap(int i) {
T node = heap[i]; // save top node
int j = i << 1; // find smaller child
@ -289,4 +294,26 @@ public abstract class PriorityQueue<T> {
protected final Object[] getHeapArray() {
return (Object[]) heap;
}
@Override
public Iterator<T> iterator() {
return new Iterator<T>() {
int i = 1;
@Override
public boolean hasNext() {
return i <= size;
}
@Override
public T next() {
if (hasNext() == false) {
throw new NoSuchElementException();
}
return heap[i++];
}
};
}
}

View File

@ -18,6 +18,9 @@ package org.apache.lucene.util;
*/
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;
public class TestPriorityQueue extends LuceneTestCase {
@ -188,4 +191,65 @@ public class TestPriorityQueue extends LuceneTestCase {
}
}
public void testIterator() {
IntegerQueue queue = new IntegerQueue(3);
Iterator<Integer> it = queue.iterator();
assertFalse(it.hasNext());
try {
it.next();
fail();
} catch (NoSuchElementException e) {
// ok
}
queue.add(1);
it = queue.iterator();
assertTrue(it.hasNext());
assertEquals(Integer.valueOf(1), it.next());
assertFalse(it.hasNext());
try {
it.next();
fail();
} catch (NoSuchElementException e) {
// ok
}
queue.add(2);
it = queue.iterator();
assertTrue(it.hasNext());
assertEquals(Integer.valueOf(1), it.next());
assertTrue(it.hasNext());
assertEquals(Integer.valueOf(2), it.next());
assertFalse(it.hasNext());
try {
it.next();
fail();
} catch (NoSuchElementException e) {
// ok
}
}
public void testIteratorRandom() {
final int maxSize = TestUtil.nextInt(random(), 1, 20);
IntegerQueue queue = new IntegerQueue(maxSize);
final int iters = atLeast(100);
final List<Integer> expected = new ArrayList<>();
for (int iter = 0; iter < iters; ++iter) {
if (queue.size() == 0 || (queue.size() < maxSize && random().nextBoolean())) {
final Integer value = new Integer(random().nextInt(10));
queue.add(value);
expected.add(value);
} else {
expected.remove(queue.pop());
}
List<Integer> actual = new ArrayList<>();
for (Integer value : queue) {
actual.add(value);
}
CollectionUtil.introSort(expected);
CollectionUtil.introSort(actual);
assertEquals(expected, actual);
}
}
}