mirror of https://github.com/apache/lucene.git
DisjunctionSumScorer performance improvement: LUCENE-365
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@465114 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b75358ebc9
commit
901c8c379c
|
@ -169,6 +169,9 @@ Optimizations
|
|||
any BooleanQuery with more than one mandatory clause.
|
||||
(Abdul Chaudhry, Paul Elschot via Yonik Seeley)
|
||||
|
||||
9. LUCENE-365: DisjunctionSumScorer performance increase of ~30%. Speeds up
|
||||
queries with optional clauses. (Paul Elschot via Yonik Seeley)
|
||||
|
||||
Test Cases
|
||||
1. Added TestTermScorer.java (Grant Ingersoll)
|
||||
|
||||
|
|
|
@ -20,10 +20,11 @@ import java.util.List;
|
|||
import java.util.Iterator;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.lucene.util.ScorerDocQueue;
|
||||
|
||||
/** A Scorer for OR like queries, counterpart of Lucene's <code>ConjunctionScorer</code>.
|
||||
/** A Scorer for OR like queries, counterpart of <code>ConjunctionScorer</code>.
|
||||
* This Scorer implements {@link Scorer#skipTo(int)} and uses skipTo() on the given Scorers.
|
||||
* @todo Implement score(HitCollector, int).
|
||||
*/
|
||||
class DisjunctionSumScorer extends Scorer {
|
||||
/** The number of subscorers. */
|
||||
|
@ -35,19 +36,20 @@ class DisjunctionSumScorer extends Scorer {
|
|||
/** The minimum number of scorers that should match. */
|
||||
private final int minimumNrMatchers;
|
||||
|
||||
/** The scorerQueue contains all subscorers ordered by their current doc(),
|
||||
/** The scorerDocQueue contains all subscorers ordered by their current doc(),
|
||||
* with the minimum at the top.
|
||||
* <br>The scorerQueue is initialized the first time next() or skipTo() is called.
|
||||
* <br>An exhausted scorer is immediately removed from the scorerQueue.
|
||||
* <br>The scorerDocQueue is initialized the first time next() or skipTo() is called.
|
||||
* <br>An exhausted scorer is immediately removed from the scorerDocQueue.
|
||||
* <br>If less than the minimumNrMatchers scorers
|
||||
* remain in the scorerQueue next() and skipTo() return false.
|
||||
* remain in the scorerDocQueue next() and skipTo() return false.
|
||||
* <p>
|
||||
* After each to call to next() or skipTo()
|
||||
* <code>currentSumScore</code> is the total score of the current matching doc,
|
||||
* <code>nrMatchers</code> is the number of matching scorers,
|
||||
* and all scorers are after the matching doc, or are exhausted.
|
||||
*/
|
||||
private ScorerQueue scorerQueue = null;
|
||||
private ScorerDocQueue scorerDocQueue = null;
|
||||
private int queueSize = -1; // used to avoid size() method calls on scorerDocQueue
|
||||
|
||||
/** The document number of the current match. */
|
||||
private int currentDoc = -1;
|
||||
|
@ -91,47 +93,65 @@ class DisjunctionSumScorer extends Scorer {
|
|||
}
|
||||
|
||||
/** Called the first time next() or skipTo() is called to
|
||||
* initialize <code>scorerQueue</code>.
|
||||
* initialize <code>scorerDocQueue</code>.
|
||||
*/
|
||||
private void initScorerQueue() throws IOException {
|
||||
private void initScorerDocQueue() throws IOException {
|
||||
Iterator si = subScorers.iterator();
|
||||
scorerQueue = new ScorerQueue(nrScorers);
|
||||
scorerDocQueue = new ScorerDocQueue(nrScorers);
|
||||
queueSize = 0;
|
||||
while (si.hasNext()) {
|
||||
Scorer se = (Scorer) si.next();
|
||||
if (se.next()) { // doc() method will be used in scorerQueue.
|
||||
scorerQueue.insert(se);
|
||||
if (se.next()) { // doc() method will be used in scorerDocQueue.
|
||||
if (scorerDocQueue.insert(se)) {
|
||||
queueSize++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** A <code>PriorityQueue</code> that orders by {@link Scorer#doc()}. */
|
||||
private class ScorerQueue extends PriorityQueue {
|
||||
ScorerQueue(int size) {
|
||||
initialize(size);
|
||||
/** Scores and collects all matching documents.
|
||||
* @param hc The collector to which all matching documents are passed through
|
||||
* {@link HitCollector#collect(int, float)}.
|
||||
* <br>When this method is used the {@link #explain(int)} method should not be used.
|
||||
*/
|
||||
public void score(HitCollector hc) throws IOException {
|
||||
while (next()) {
|
||||
hc.collect(currentDoc, currentScore);
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean lessThan(Object o1, Object o2) {
|
||||
return ((Scorer)o1).doc() < ((Scorer)o2).doc();
|
||||
/** Expert: Collects matching documents in a range. Hook for optimization.
|
||||
* Note that {@link #next()} must be called once before this method is called
|
||||
* for the first time.
|
||||
* @param hc The collector to which all matching documents are passed through
|
||||
* {@link HitCollector#collect(int, float)}.
|
||||
* @param max Do not score documents past this.
|
||||
* @return true if more matching documents may remain.
|
||||
*/
|
||||
protected boolean score(HitCollector hc, int max) throws IOException {
|
||||
while (currentDoc < max) {
|
||||
hc.collect(currentDoc, currentScore);
|
||||
if (!next()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean next() throws IOException {
|
||||
if (scorerQueue == null) {
|
||||
initScorerQueue();
|
||||
}
|
||||
if (scorerQueue.size() < minimumNrMatchers) {
|
||||
return false;
|
||||
} else {
|
||||
return advanceAfterCurrent();
|
||||
if (scorerDocQueue == null) {
|
||||
initScorerDocQueue();
|
||||
}
|
||||
return (scorerDocQueue.size() >= minimumNrMatchers)
|
||||
&& advanceAfterCurrent();
|
||||
}
|
||||
|
||||
|
||||
/** Advance all subscorers after the current document determined by the
|
||||
* top of the <code>scorerQueue</code>.
|
||||
* top of the <code>scorerDocQueue</code>.
|
||||
* Repeat until at least the minimum number of subscorers match on the same
|
||||
* document and all subscorers are after that document or are exhausted.
|
||||
* <br>On entry the <code>scorerQueue</code> has at least <code>minimumNrMatchers</code>
|
||||
* <br>On entry the <code>scorerDocQueue</code> has at least <code>minimumNrMatchers</code>
|
||||
* available. At least the scorer with the minimum document number will be advanced.
|
||||
* @return true iff there is a match.
|
||||
* <br>In case there is a match, </code>currentDoc</code>, </code>currentSumScore</code>,
|
||||
|
@ -140,39 +160,32 @@ class DisjunctionSumScorer extends Scorer {
|
|||
* @todo Investigate whether it is possible to use skipTo() when
|
||||
* the minimum number of matchers is bigger than one, ie. try and use the
|
||||
* character of ConjunctionScorer for the minimum number of matchers.
|
||||
* Also delay calling score() on the sub scorers until the minimum number of
|
||||
* matchers is reached.
|
||||
* <br>For this, a Scorer array with minimumNrMatchers elements might
|
||||
* hold Scorers at currentDoc that are temporarily popped from scorerQueue.
|
||||
*/
|
||||
protected boolean advanceAfterCurrent() throws IOException {
|
||||
do { // repeat until minimum nr of matchers
|
||||
Scorer top = (Scorer) scorerQueue.top();
|
||||
currentDoc = top.doc();
|
||||
currentScore = top.score();
|
||||
currentDoc = scorerDocQueue.topDoc();
|
||||
currentScore = scorerDocQueue.topScore();
|
||||
nrMatchers = 1;
|
||||
do { // Until all subscorers are after currentDoc
|
||||
if (top.next()) {
|
||||
scorerQueue.adjustTop();
|
||||
} else {
|
||||
scorerQueue.pop();
|
||||
if (scorerQueue.size() < (minimumNrMatchers - nrMatchers)) {
|
||||
// Not enough subscorers left for a match on this document,
|
||||
// and also no more chance of any further match.
|
||||
return false;
|
||||
}
|
||||
if (scorerQueue.size() == 0) {
|
||||
if (! scorerDocQueue.topNextAndAdjustElsePop()) {
|
||||
if (--queueSize == 0) {
|
||||
break; // nothing more to advance, check for last match.
|
||||
}
|
||||
}
|
||||
top = (Scorer) scorerQueue.top();
|
||||
if (top.doc() != currentDoc) {
|
||||
if (scorerDocQueue.topDoc() != currentDoc) {
|
||||
break; // All remaining subscorers are after currentDoc.
|
||||
} else {
|
||||
currentScore += top.score();
|
||||
nrMatchers++;
|
||||
}
|
||||
currentScore += scorerDocQueue.topScore();
|
||||
nrMatchers++;
|
||||
} while (true);
|
||||
|
||||
if (nrMatchers >= minimumNrMatchers) {
|
||||
return true;
|
||||
} else if (scorerQueue.size() < minimumNrMatchers) {
|
||||
} else if (queueSize < minimumNrMatchers) {
|
||||
return false;
|
||||
}
|
||||
} while (true);
|
||||
|
@ -200,39 +213,49 @@ class DisjunctionSumScorer extends Scorer {
|
|||
* @return true iff there is such a match.
|
||||
*/
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
if (scorerQueue == null) {
|
||||
initScorerQueue();
|
||||
if (scorerDocQueue == null) {
|
||||
initScorerDocQueue();
|
||||
}
|
||||
if (scorerQueue.size() < minimumNrMatchers) {
|
||||
if (queueSize < minimumNrMatchers) {
|
||||
return false;
|
||||
}
|
||||
if (target <= currentDoc) {
|
||||
return true;
|
||||
}
|
||||
do {
|
||||
Scorer top = (Scorer) scorerQueue.top();
|
||||
if (top.doc() >= target) {
|
||||
if (scorerDocQueue.topDoc() >= target) {
|
||||
return advanceAfterCurrent();
|
||||
} else if (top.skipTo(target)) {
|
||||
scorerQueue.adjustTop();
|
||||
} else {
|
||||
scorerQueue.pop();
|
||||
if (scorerQueue.size() < minimumNrMatchers) {
|
||||
} else if (! scorerDocQueue.topSkipToAndAdjustElsePop(target)) {
|
||||
if (--queueSize < minimumNrMatchers) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} while (true);
|
||||
}
|
||||
|
||||
/** Gives and explanation for the score of a given document.
|
||||
* @todo Show the resulting score. See BooleanScorer.explain() on how to do this.
|
||||
*/
|
||||
/** @return An explanation for the score of a given document. */
|
||||
public Explanation explain(int doc) throws IOException {
|
||||
Explanation res = new Explanation();
|
||||
res.setDescription("At least " + minimumNrMatchers + " of");
|
||||
Iterator ssi = subScorers.iterator();
|
||||
float sumScore = 0.0f;
|
||||
int nrMatches = 0;
|
||||
while (ssi.hasNext()) {
|
||||
res.addDetail( ((Scorer) ssi.next()).explain(doc));
|
||||
Explanation es = ((Scorer) ssi.next()).explain(doc);
|
||||
if (es.getValue() > 0.0f) { // indicates match
|
||||
sumScore += es.getValue();
|
||||
nrMatches++;
|
||||
}
|
||||
res.addDetail(es);
|
||||
}
|
||||
if (nrMatchers >= minimumNrMatchers) {
|
||||
res.setValue(sumScore);
|
||||
res.setDescription("sum over at least " + minimumNrMatchers
|
||||
+ " of " + subScorers.size() + ":");
|
||||
} else {
|
||||
res.setValue(0.0f);
|
||||
res.setDescription(nrMatches + " match(es) but at least "
|
||||
+ minimumNrMatchers + " of "
|
||||
+ subScorers.size() + " needed");
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,214 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/**
|
||||
* Copyright 2006 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Derived from org.apache.lucene.util.PriorityQueue of March 2005 */
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
|
||||
/** A ScorerDocQueue maintains a partial ordering of its Scorers such that the
|
||||
least Scorer can always be found in constant time. Put()'s and pop()'s
|
||||
require log(size) time. The ordering is by Scorer.doc().
|
||||
*/
|
||||
public class ScorerDocQueue { // later: SpansQueue for spans with doc and term positions
|
||||
private final HeapedScorerDoc[] heap;
|
||||
private final int maxSize;
|
||||
private int size;
|
||||
|
||||
private class HeapedScorerDoc {
|
||||
Scorer scorer;
|
||||
int doc;
|
||||
|
||||
HeapedScorerDoc(Scorer s) { this(s, s.doc()); }
|
||||
|
||||
HeapedScorerDoc(Scorer scorer, int doc) {
|
||||
this.scorer = scorer;
|
||||
this.doc = doc;
|
||||
}
|
||||
|
||||
void adjust() { doc = scorer.doc(); }
|
||||
}
|
||||
|
||||
private HeapedScorerDoc topHSD; // same as heap[1], only for speed
|
||||
|
||||
/** Create a ScorerDocQueue with a maximum size. */
|
||||
public ScorerDocQueue(int maxSize) {
|
||||
// assert maxSize >= 0;
|
||||
size = 0;
|
||||
int heapSize = maxSize + 1;
|
||||
heap = new HeapedScorerDoc[heapSize];
|
||||
this.maxSize = maxSize;
|
||||
topHSD = heap[1]; // initially null
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a Scorer to a ScorerDocQueue in log(size) time.
|
||||
* If one tries to add more Scorers than maxSize
|
||||
* a RuntimeException (ArrayIndexOutOfBound) is thrown.
|
||||
*/
|
||||
public final void put(Scorer scorer) {
|
||||
size++;
|
||||
heap[size] = new HeapedScorerDoc(scorer);
|
||||
upHeap();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a Scorer to the ScorerDocQueue in log(size) time if either
|
||||
* the ScorerDocQueue is not full, or not lessThan(scorer, top()).
|
||||
* @param scorer
|
||||
* @return true if scorer is added, false otherwise.
|
||||
*/
|
||||
public boolean insert(Scorer scorer){
|
||||
if (size < maxSize) {
|
||||
put(scorer);
|
||||
return true;
|
||||
} else {
|
||||
int docNr = scorer.doc();
|
||||
if ((size > 0) && (! (docNr < topHSD.doc))) { // heap[1] is top()
|
||||
heap[1] = new HeapedScorerDoc(scorer, docNr);
|
||||
downHeap();
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the least Scorer of the ScorerDocQueue in constant time.
|
||||
* Should not be used when the queue is empty.
|
||||
*/
|
||||
public final Scorer top() {
|
||||
// assert size > 0;
|
||||
return topHSD.scorer;
|
||||
}
|
||||
|
||||
/** Returns document number of the least Scorer of the ScorerDocQueue
|
||||
* in constant time.
|
||||
* Should not be used when the queue is empty.
|
||||
*/
|
||||
public final int topDoc() {
|
||||
// assert size > 0;
|
||||
return topHSD.doc;
|
||||
}
|
||||
|
||||
public final float topScore() throws IOException {
|
||||
// assert size > 0;
|
||||
return topHSD.scorer.score();
|
||||
}
|
||||
|
||||
public final boolean topNextAndAdjustElsePop() throws IOException {
|
||||
return checkAdjustElsePop( topHSD.scorer.next());
|
||||
}
|
||||
|
||||
public final boolean topSkipToAndAdjustElsePop(int target) throws IOException {
|
||||
return checkAdjustElsePop( topHSD.scorer.skipTo(target));
|
||||
}
|
||||
|
||||
private boolean checkAdjustElsePop(boolean cond) {
|
||||
if (cond) { // see also adjustTop
|
||||
topHSD.doc = topHSD.scorer.doc();
|
||||
} else { // see also popNoResult
|
||||
heap[1] = heap[size]; // move last to first
|
||||
heap[size] = null;
|
||||
size--;
|
||||
}
|
||||
downHeap();
|
||||
return cond;
|
||||
}
|
||||
|
||||
/** Removes and returns the least scorer of the ScorerDocQueue in log(size)
|
||||
* time.
|
||||
* Should not be used when the queue is empty.
|
||||
*/
|
||||
public final Scorer pop() {
|
||||
// assert size > 0;
|
||||
Scorer result = topHSD.scorer;
|
||||
popNoResult();
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Removes the least scorer of the ScorerDocQueue in log(size) time.
|
||||
* Should not be used when the queue is empty.
|
||||
*/
|
||||
private final void popNoResult() {
|
||||
heap[1] = heap[size]; // move last to first
|
||||
heap[size] = null;
|
||||
size--;
|
||||
downHeap(); // adjust heap
|
||||
}
|
||||
|
||||
/** Should be called when the scorer at top changes doc() value.
|
||||
* Still log(n) worst case, but it's at least twice as fast to <pre>
|
||||
* { pq.top().change(); pq.adjustTop(); }
|
||||
* </pre> instead of <pre>
|
||||
* { o = pq.pop(); o.change(); pq.push(o); }
|
||||
* </pre>
|
||||
*/
|
||||
public final void adjustTop() {
|
||||
// assert size > 0;
|
||||
topHSD.adjust();
|
||||
downHeap();
|
||||
}
|
||||
|
||||
/** Returns the number of scorers currently stored in the ScorerDocQueue. */
|
||||
public final int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
/** Removes all entries from the ScorerDocQueue. */
|
||||
public final void clear() {
|
||||
for (int i = 0; i <= size; i++) {
|
||||
heap[i] = null;
|
||||
}
|
||||
size = 0;
|
||||
}
|
||||
|
||||
private final void upHeap() {
|
||||
int i = size;
|
||||
HeapedScorerDoc node = heap[i]; // save bottom node
|
||||
int j = i >>> 1;
|
||||
while ((j > 0) && (node.doc < heap[j].doc)) {
|
||||
heap[i] = heap[j]; // shift parents down
|
||||
i = j;
|
||||
j = j >>> 1;
|
||||
}
|
||||
heap[i] = node; // install saved node
|
||||
topHSD = heap[1];
|
||||
}
|
||||
|
||||
private final void downHeap() {
|
||||
int i = 1;
|
||||
HeapedScorerDoc node = heap[i]; // save top node
|
||||
int j = i << 1; // find smaller child
|
||||
int k = j + 1;
|
||||
if ((k <= size) && (heap[k].doc < heap[j].doc)) {
|
||||
j = k;
|
||||
}
|
||||
while ((j <= size) && (heap[j].doc < node.doc)) {
|
||||
heap[i] = heap[j]; // shift up child
|
||||
i = j;
|
||||
j = i << 1;
|
||||
k = j + 1;
|
||||
if (k <= size && (heap[k].doc < heap[j].doc)) {
|
||||
j = k;
|
||||
}
|
||||
}
|
||||
heap[i] = node; // install saved node
|
||||
topHSD = heap[1];
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue