DisjunctionSumScorer performance improvement: LUCENE-365

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@465114 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2006-10-18 00:56:08 +00:00
parent b75358ebc9
commit 901c8c379c
3 changed files with 304 additions and 64 deletions

View File

@ -169,6 +169,9 @@ Optimizations
any BooleanQuery with more than one mandatory clause. any BooleanQuery with more than one mandatory clause.
(Abdul Chaudhry, Paul Elschot via Yonik Seeley) (Abdul Chaudhry, Paul Elschot via Yonik Seeley)
9. LUCENE-365: DisjunctionSumScorer performance increase of ~30%. Speeds up
queries with optional clauses. (Paul Elschot via Yonik Seeley)
Test Cases Test Cases
1. Added TestTermScorer.java (Grant Ingersoll) 1. Added TestTermScorer.java (Grant Ingersoll)

View File

@ -20,10 +20,11 @@ import java.util.List;
import java.util.Iterator; import java.util.Iterator;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.ScorerDocQueue;
/** A Scorer for OR like queries, counterpart of Lucene's <code>ConjunctionScorer</code>. /** A Scorer for OR like queries, counterpart of <code>ConjunctionScorer</code>.
* This Scorer implements {@link Scorer#skipTo(int)} and uses skipTo() on the given Scorers. * This Scorer implements {@link Scorer#skipTo(int)} and uses skipTo() on the given Scorers.
* @todo Implement score(HitCollector, int).
*/ */
class DisjunctionSumScorer extends Scorer { class DisjunctionSumScorer extends Scorer {
/** The number of subscorers. */ /** The number of subscorers. */
@ -35,19 +36,20 @@ class DisjunctionSumScorer extends Scorer {
/** The minimum number of scorers that should match. */ /** The minimum number of scorers that should match. */
private final int minimumNrMatchers; private final int minimumNrMatchers;
/** The scorerQueue contains all subscorers ordered by their current doc(), /** The scorerDocQueue contains all subscorers ordered by their current doc(),
* with the minimum at the top. * with the minimum at the top.
* <br>The scorerQueue is initialized the first time next() or skipTo() is called. * <br>The scorerDocQueue is initialized the first time next() or skipTo() is called.
* <br>An exhausted scorer is immediately removed from the scorerQueue. * <br>An exhausted scorer is immediately removed from the scorerDocQueue.
* <br>If less than the minimumNrMatchers scorers * <br>If less than the minimumNrMatchers scorers
* remain in the scorerQueue next() and skipTo() return false. * remain in the scorerDocQueue next() and skipTo() return false.
* <p> * <p>
* After each call to next() or skipTo() * After each call to next() or skipTo()
* <code>currentSumScore</code> is the total score of the current matching doc, * <code>currentSumScore</code> is the total score of the current matching doc,
* <code>nrMatchers</code> is the number of matching scorers, * <code>nrMatchers</code> is the number of matching scorers,
* and all scorers are after the matching doc, or are exhausted. * and all scorers are after the matching doc, or are exhausted.
*/ */
private ScorerQueue scorerQueue = null; private ScorerDocQueue scorerDocQueue = null;
private int queueSize = -1; // used to avoid size() method calls on scorerDocQueue
/** The document number of the current match. */ /** The document number of the current match. */
private int currentDoc = -1; private int currentDoc = -1;
@ -91,47 +93,65 @@ class DisjunctionSumScorer extends Scorer {
} }
/** Called the first time next() or skipTo() is called to /** Called the first time next() or skipTo() is called to
* initialize <code>scorerQueue</code>. * initialize <code>scorerDocQueue</code>.
*/ */
private void initScorerQueue() throws IOException { private void initScorerDocQueue() throws IOException {
Iterator si = subScorers.iterator(); Iterator si = subScorers.iterator();
scorerQueue = new ScorerQueue(nrScorers); scorerDocQueue = new ScorerDocQueue(nrScorers);
queueSize = 0;
while (si.hasNext()) { while (si.hasNext()) {
Scorer se = (Scorer) si.next(); Scorer se = (Scorer) si.next();
if (se.next()) { // doc() method will be used in scorerQueue. if (se.next()) { // doc() method will be used in scorerDocQueue.
scorerQueue.insert(se); if (scorerDocQueue.insert(se)) {
queueSize++;
}
} }
} }
} }
/** A <code>PriorityQueue</code> that orders by {@link Scorer#doc()}. */ /** Scores and collects all matching documents.
private class ScorerQueue extends PriorityQueue { * @param hc The collector to which all matching documents are passed through
ScorerQueue(int size) { * {@link HitCollector#collect(int, float)}.
initialize(size); * <br>When this method is used the {@link #explain(int)} method should not be used.
*/
public void score(HitCollector hc) throws IOException {
while (next()) {
hc.collect(currentDoc, currentScore);
}
} }
protected boolean lessThan(Object o1, Object o2) { /** Expert: Collects matching documents in a range. Hook for optimization.
return ((Scorer)o1).doc() < ((Scorer)o2).doc(); * Note that {@link #next()} must be called once before this method is called
* for the first time.
* @param hc The collector to which all matching documents are passed through
* {@link HitCollector#collect(int, float)}.
* @param max Do not score documents past this.
* @return true if more matching documents may remain.
*/
protected boolean score(HitCollector hc, int max) throws IOException {
while (currentDoc < max) {
hc.collect(currentDoc, currentScore);
if (!next()) {
return false;
} }
} }
return true;
}
public boolean next() throws IOException { public boolean next() throws IOException {
if (scorerQueue == null) { if (scorerDocQueue == null) {
initScorerQueue(); initScorerDocQueue();
}
if (scorerQueue.size() < minimumNrMatchers) {
return false;
} else {
return advanceAfterCurrent();
} }
return (scorerDocQueue.size() >= minimumNrMatchers)
&& advanceAfterCurrent();
} }
/** Advance all subscorers after the current document determined by the /** Advance all subscorers after the current document determined by the
* top of the <code>scorerQueue</code>. * top of the <code>scorerDocQueue</code>.
* Repeat until at least the minimum number of subscorers match on the same * Repeat until at least the minimum number of subscorers match on the same
* document and all subscorers are after that document or are exhausted. * document and all subscorers are after that document or are exhausted.
* <br>On entry the <code>scorerQueue</code> has at least <code>minimumNrMatchers</code> * <br>On entry the <code>scorerDocQueue</code> has at least <code>minimumNrMatchers</code>
* available. At least the scorer with the minimum document number will be advanced. * available. At least the scorer with the minimum document number will be advanced.
* @return true iff there is a match. * @return true iff there is a match.
* <br>In case there is a match, <code>currentDoc</code>, <code>currentSumScore</code>, * <br>In case there is a match, <code>currentDoc</code>, <code>currentSumScore</code>,
@ -140,39 +160,32 @@ class DisjunctionSumScorer extends Scorer {
* @todo Investigate whether it is possible to use skipTo() when * @todo Investigate whether it is possible to use skipTo() when
* the minimum number of matchers is bigger than one, ie. try and use the * the minimum number of matchers is bigger than one, ie. try and use the
* character of ConjunctionScorer for the minimum number of matchers. * character of ConjunctionScorer for the minimum number of matchers.
* Also delay calling score() on the sub scorers until the minimum number of
* matchers is reached.
* <br>For this, a Scorer array with minimumNrMatchers elements might
* hold Scorers at currentDoc that are temporarily popped from scorerQueue.
*/ */
protected boolean advanceAfterCurrent() throws IOException { protected boolean advanceAfterCurrent() throws IOException {
do { // repeat until minimum nr of matchers do { // repeat until minimum nr of matchers
Scorer top = (Scorer) scorerQueue.top(); currentDoc = scorerDocQueue.topDoc();
currentDoc = top.doc(); currentScore = scorerDocQueue.topScore();
currentScore = top.score();
nrMatchers = 1; nrMatchers = 1;
do { // Until all subscorers are after currentDoc do { // Until all subscorers are after currentDoc
if (top.next()) { if (! scorerDocQueue.topNextAndAdjustElsePop()) {
scorerQueue.adjustTop(); if (--queueSize == 0) {
} else {
scorerQueue.pop();
if (scorerQueue.size() < (minimumNrMatchers - nrMatchers)) {
// Not enough subscorers left for a match on this document,
// and also no more chance of any further match.
return false;
}
if (scorerQueue.size() == 0) {
break; // nothing more to advance, check for last match. break; // nothing more to advance, check for last match.
} }
} }
top = (Scorer) scorerQueue.top(); if (scorerDocQueue.topDoc() != currentDoc) {
if (top.doc() != currentDoc) {
break; // All remaining subscorers are after currentDoc. break; // All remaining subscorers are after currentDoc.
} else {
currentScore += top.score();
nrMatchers++;
} }
currentScore += scorerDocQueue.topScore();
nrMatchers++;
} while (true); } while (true);
if (nrMatchers >= minimumNrMatchers) { if (nrMatchers >= minimumNrMatchers) {
return true; return true;
} else if (scorerQueue.size() < minimumNrMatchers) { } else if (queueSize < minimumNrMatchers) {
return false; return false;
} }
} while (true); } while (true);
@ -200,39 +213,49 @@ class DisjunctionSumScorer extends Scorer {
* @return true iff there is such a match. * @return true iff there is such a match.
*/ */
public boolean skipTo(int target) throws IOException { public boolean skipTo(int target) throws IOException {
if (scorerQueue == null) { if (scorerDocQueue == null) {
initScorerQueue(); initScorerDocQueue();
} }
if (scorerQueue.size() < minimumNrMatchers) { if (queueSize < minimumNrMatchers) {
return false; return false;
} }
if (target <= currentDoc) { if (target <= currentDoc) {
return true; return true;
} }
do { do {
Scorer top = (Scorer) scorerQueue.top(); if (scorerDocQueue.topDoc() >= target) {
if (top.doc() >= target) {
return advanceAfterCurrent(); return advanceAfterCurrent();
} else if (top.skipTo(target)) { } else if (! scorerDocQueue.topSkipToAndAdjustElsePop(target)) {
scorerQueue.adjustTop(); if (--queueSize < minimumNrMatchers) {
} else {
scorerQueue.pop();
if (scorerQueue.size() < minimumNrMatchers) {
return false; return false;
} }
} }
} while (true); } while (true);
} }
/** Gives and explanation for the score of a given document. /** @return An explanation for the score of a given document. */
* @todo Show the resulting score. See BooleanScorer.explain() on how to do this.
*/
public Explanation explain(int doc) throws IOException { public Explanation explain(int doc) throws IOException {
Explanation res = new Explanation(); Explanation res = new Explanation();
res.setDescription("At least " + minimumNrMatchers + " of");
Iterator ssi = subScorers.iterator(); Iterator ssi = subScorers.iterator();
float sumScore = 0.0f;
int nrMatches = 0;
while (ssi.hasNext()) { while (ssi.hasNext()) {
res.addDetail( ((Scorer) ssi.next()).explain(doc)); Explanation es = ((Scorer) ssi.next()).explain(doc);
if (es.getValue() > 0.0f) { // indicates match
sumScore += es.getValue();
nrMatches++;
}
res.addDetail(es);
}
if (nrMatchers >= minimumNrMatchers) {
res.setValue(sumScore);
res.setDescription("sum over at least " + minimumNrMatchers
+ " of " + subScorers.size() + ":");
} else {
res.setValue(0.0f);
res.setDescription(nrMatches + " match(es) but at least "
+ minimumNrMatchers + " of "
+ subScorers.size() + " needed");
} }
return res; return res;
} }

View File

@ -0,0 +1,214 @@
package org.apache.lucene.util;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Derived from org.apache.lucene.util.PriorityQueue of March 2005 */
import java.io.IOException;
import org.apache.lucene.search.Scorer;
/** A ScorerDocQueue maintains a partial ordering of its Scorers such that the
 * least Scorer can always be found in constant time. put()'s and pop()'s
 * require log(size) time. The ordering is by Scorer.doc().
 * <p>
 * The document number of each Scorer is cached in the queue when the Scorer
 * is added. The cached value of the top Scorer is refreshed only by
 * {@link #adjustTop()}, {@link #topNextAndAdjustElsePop()} and
 * {@link #topSkipToAndAdjustElsePop(int)}, so a Scorer that is advanced
 * directly by the caller must be followed by a call to {@link #adjustTop()}.
 */
public class ScorerDocQueue { // later: SpansQueue for spans with doc and term positions
  /** Binary heap of entries; slot 0 is unused, the least entry is in slot 1. */
  private final HeapedScorerDoc[] heap;
  private final int maxSize;
  private int size;

  /** A heap entry: a Scorer together with a cached copy of its doc().
   * Static because it never needs the enclosing queue instance.
   */
  private static class HeapedScorerDoc {
    Scorer scorer;
    int doc;

    HeapedScorerDoc(Scorer s) { this(s, s.doc()); }

    HeapedScorerDoc(Scorer scorer, int doc) {
      this.scorer = scorer;
      this.doc = doc;
    }

    /** Refreshes the cached document number from the Scorer. */
    void adjust() { doc = scorer.doc(); }
  }

  private HeapedScorerDoc topHSD; // same as heap[1], only for speed

  /** Create a ScorerDocQueue with a maximum size.
   * @param maxSize The maximum number of Scorers the queue can hold,
   *                zero or more.
   * @throws IllegalArgumentException when maxSize is negative.
   */
  public ScorerDocQueue(int maxSize) {
    if (maxSize < 0) {
      throw new IllegalArgumentException("maxSize must be >= 0, got: " + maxSize);
    }
    size = 0;
    int heapSize = maxSize + 1;
    heap = new HeapedScorerDoc[heapSize];
    this.maxSize = maxSize;
    // Do not read heap[1] here: it does not exist when maxSize is 0.
    topHSD = null;
  }

  /**
   * Adds a Scorer to a ScorerDocQueue in log(size) time.
   * If one tries to add more Scorers than maxSize
   * a RuntimeException (ArrayIndexOutOfBound) is thrown.
   * @param scorer The Scorer to add; its current doc() is cached as the
   *               ordering key.
   */
  public final void put(Scorer scorer) {
    // Store before incrementing so that an overflow leaves size untouched.
    heap[size + 1] = new HeapedScorerDoc(scorer);
    size++;
    upHeap();
  }

  /**
   * Adds a Scorer to the ScorerDocQueue in log(size) time if either
   * the ScorerDocQueue is not full, or the document number of the given
   * Scorer is not smaller than the document number at the top.
   * In the latter case the given Scorer replaces the Scorer at the top.
   * @param scorer The Scorer to add.
   * @return true if scorer is added, false otherwise.
   */
  public boolean insert(Scorer scorer) {
    if (size < maxSize) {
      put(scorer);
      return true;
    } else {
      int docNr = scorer.doc();
      if ((size > 0) && (! (docNr < topHSD.doc))) { // heap[1] is top()
        heap[1] = new HeapedScorerDoc(scorer, docNr);
        downHeap();
        return true;
      } else {
        return false;
      }
    }
  }

  /** Returns the least Scorer of the ScorerDocQueue in constant time.
   * Should not be used when the queue is empty.
   */
  public final Scorer top() {
    // assert size > 0;
    return topHSD.scorer;
  }

  /** Returns document number of the least Scorer of the ScorerDocQueue
   * in constant time.
   * Should not be used when the queue is empty.
   */
  public final int topDoc() {
    // assert size > 0;
    return topHSD.doc;
  }

  /** Returns the score() of the least Scorer of the ScorerDocQueue.
   * Should not be used when the queue is empty.
   */
  public final float topScore() throws IOException {
    // assert size > 0;
    return topHSD.scorer.score();
  }

  /** Calls next() on the least Scorer. When that returns true the queue is
   * reordered for the new document number of that Scorer, otherwise the
   * Scorer is removed from the queue.
   * Should not be used when the queue is empty.
   * @return the value returned by next() on the least Scorer.
   */
  public final boolean topNextAndAdjustElsePop() throws IOException {
    return checkAdjustElsePop( topHSD.scorer.next());
  }

  /** Calls skipTo(target) on the least Scorer. When that returns true the
   * queue is reordered for the new document number of that Scorer, otherwise
   * the Scorer is removed from the queue.
   * Should not be used when the queue is empty.
   * @param target The document number passed to skipTo().
   * @return the value returned by skipTo(target) on the least Scorer.
   */
  public final boolean topSkipToAndAdjustElsePop(int target) throws IOException {
    return checkAdjustElsePop( topHSD.scorer.skipTo(target));
  }

  /** Either refreshes the cached doc of the top entry (cond is true) or
   * removes the top entry (cond is false), and restores the heap order.
   * @return cond, unchanged.
   */
  private boolean checkAdjustElsePop(boolean cond) {
    if (cond) { // see also adjustTop
      topHSD.doc = topHSD.scorer.doc();
    } else { // see also popNoResult
      heap[1] = heap[size]; // move last to first
      heap[size] = null;
      size--;
    }
    downHeap();
    return cond;
  }

  /** Removes and returns the least scorer of the ScorerDocQueue in log(size)
   * time.
   * Should not be used when the queue is empty.
   */
  public final Scorer pop() {
    // assert size > 0;
    Scorer result = topHSD.scorer;
    popNoResult();
    return result;
  }

  /** Removes the least scorer of the ScorerDocQueue in log(size) time.
   * Should not be used when the queue is empty.
   */
  private final void popNoResult() {
    heap[1] = heap[size]; // move last to first
    heap[size] = null;
    size--;
    downHeap(); // adjust heap
  }

  /** Should be called when the scorer at top changes doc() value.
   * Still log(n) worst case, but it's at least twice as fast to <pre>
   *  { sdq.top().next(); sdq.adjustTop(); }
   * </pre> instead of <pre>
   *  { s = sdq.pop(); s.next(); sdq.put(s); }
   * </pre>
   * Should not be used when the queue is empty.
   */
  public final void adjustTop() {
    // assert size > 0;
    topHSD.adjust();
    downHeap();
  }

  /** Returns the number of scorers currently stored in the ScorerDocQueue. */
  public final int size() {
    return size;
  }

  /** Removes all entries from the ScorerDocQueue. */
  public final void clear() {
    for (int i = 0; i <= size; i++) {
      heap[i] = null;
    }
    size = 0;
    // Drop the cached top as well, otherwise the old top Scorer stays
    // reachable and top()/topDoc() would return stale data.
    topHSD = null;
  }

  /** Moves the entry in the last used slot up until its parent is not
   * greater, then refreshes the cached top.
   */
  private final void upHeap() {
    int i = size;
    HeapedScorerDoc node = heap[i]; // save bottom node
    int j = i >>> 1;
    while ((j > 0) && (node.doc < heap[j].doc)) {
      heap[i] = heap[j]; // shift parents down
      i = j;
      j = j >>> 1;
    }
    heap[i] = node; // install saved node
    topHSD = heap[1];
  }

  /** Moves the entry in slot 1 down until no child is smaller, then
   * refreshes the cached top. With an empty queue this only sets the
   * cached top to the (null) content of slot 1.
   */
  private final void downHeap() {
    int i = 1;
    HeapedScorerDoc node = heap[i]; // save top node
    int j = i << 1; // find smaller child
    int k = j + 1;
    if ((k <= size) && (heap[k].doc < heap[j].doc)) {
      j = k;
    }
    while ((j <= size) && (heap[j].doc < node.doc)) {
      heap[i] = heap[j]; // shift up child
      i = j;
      j = i << 1;
      k = j + 1;
      if (k <= size && (heap[k].doc < heap[j].doc)) {
        j = k;
      }
    }
    heap[i] = node; // install saved node
    topHSD = heap[1];
  }
}