DisjunctionSumScorer performance improvement: LUCENE-365

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@465114 13f79535-47bb-0310-9956-ffa450edef68
2006-10-18 00:56:08 +00:00 · 2006-10-18 00:56:08 +00:00 · 901c8c379c
parent b75358ebc9
commit 901c8c379c
3 changed files with 304 additions and 64 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -169,6 +169,9 @@ Optimizations
     any BooleanQuery with more than one mandatory clause.
     (Abdul Chaudhry, Paul Elschot via Yonik Seeley)

+  9. LUCENE-365: DisjunctionSumScorer performance increase of ~30%. Speeds up
+     queries with optional clauses. (Paul Elschot via Yonik Seeley)
+
 Test Cases
  1. Added TestTermScorer.java (Grant Ingersoll)

--- a/src/java/org/apache/lucene/search/DisjunctionSumScorer.java
+++ b/src/java/org/apache/lucene/search/DisjunctionSumScorer.java
@ -20,10 +20,11 @@ import java.util.List;
 import java.util.Iterator;
 import java.io.IOException;

-import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.ScorerDocQueue;

-/** A Scorer for OR like queries, counterpart of Lucene's <code>ConjunctionScorer</code>.
+/** A Scorer for OR like queries, counterpart of <code>ConjunctionScorer</code>.
 * This Scorer implements {@link Scorer#skipTo(int)} and uses skipTo() on the given Scorers. 
+ * @todo Implement score(HitCollector, int).
 */
 class DisjunctionSumScorer extends Scorer {
  /** The number of subscorers. */ 
@ -35,19 +36,20 @@ class DisjunctionSumScorer extends Scorer {
  /** The minimum number of scorers that should match. */
  private final int minimumNrMatchers;
  
-  /** The scorerQueue contains all subscorers ordered by their current doc(),
+  /** The scorerDocQueue contains all subscorers ordered by their current doc(),
   * with the minimum at the top.
-   * <br>The scorerQueue is initialized the first time next() or skipTo() is called.
-   * <br>An exhausted scorer is immediately removed from the scorerQueue.
+   * <br>The scorerDocQueue is initialized the first time next() or skipTo() is called.
+   * <br>An exhausted scorer is immediately removed from the scorerDocQueue.
   * <br>If less than the minimumNrMatchers scorers
-   * remain in the scorerQueue next() and skipTo() return false.
+   * remain in the scorerDocQueue next() and skipTo() return false.
   * <p>
   * After each to call to next() or skipTo()
   * <code>currentSumScore</code> is the total score of the current matching doc,
   * <code>nrMatchers</code> is the number of matching scorers,
   * and all scorers are after the matching doc, or are exhausted.
   */
-  private ScorerQueue scorerQueue = null;
+  private ScorerDocQueue scorerDocQueue = null;
+  private int queueSize = -1; // used to avoid size() method calls on scorerDocQueue
  
  /** The document number of the current match. */
  private int currentDoc = -1;
@ -91,47 +93,65 @@ class DisjunctionSumScorer extends Scorer {
  }

  /** Called the first time next() or skipTo() is called to
-   * initialize <code>scorerQueue</code>.
+   * initialize <code>scorerDocQueue</code>.
   */
-  private void initScorerQueue() throws IOException {
+  private void initScorerDocQueue() throws IOException {
    Iterator si = subScorers.iterator();
-    scorerQueue = new ScorerQueue(nrScorers);
+    scorerDocQueue = new ScorerDocQueue(nrScorers);
+    queueSize = 0;
    while (si.hasNext()) {
      Scorer se = (Scorer) si.next();
-      if (se.next()) { // doc() method will be used in scorerQueue.
-        scorerQueue.insert(se);
+      if (se.next()) { // doc() method will be used in scorerDocQueue.
+        if (scorerDocQueue.insert(se)) {
+          queueSize++;
+        }
      }
    }
  }

-  /** A <code>PriorityQueue</code> that orders by {@link Scorer#doc()}. */
-  private class ScorerQueue extends PriorityQueue {
-    ScorerQueue(int size) {
-      initialize(size);
-    }
-
-    protected boolean lessThan(Object o1, Object o2) {
-      return ((Scorer)o1).doc() < ((Scorer)o2).doc();
+  /** Scores and collects all matching documents.
+   * @param hc The collector to which all matching documents are passed through
+   * {@link HitCollector#collect(int, float)}.
+   * <br>When this method is used the {@link #explain(int)} method should not be used.
+   */
+  public void score(HitCollector hc) throws IOException {
+    while (next()) {
+      hc.collect(currentDoc, currentScore);
    }
  }
-  
+
+  /** Expert: Collects matching documents in a range.  Hook for optimization.
+   * Note that {@link #next()} must be called once before this method is called
+   * for the first time.
+   * @param hc The collector to which all matching documents are passed through
+   * {@link HitCollector#collect(int, float)}.
+   * @param max Do not score documents past this.
+   * @return true if more matching documents may remain.
+   */
+  protected boolean score(HitCollector hc, int max) throws IOException {
+    while (currentDoc < max) {
+      hc.collect(currentDoc, currentScore);
+      if (!next()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
  public boolean next() throws IOException {
-    if (scorerQueue == null) {
-      initScorerQueue();
-    }
-    if (scorerQueue.size() < minimumNrMatchers) {
-      return false;
-    } else {
-      return advanceAfterCurrent();
+    if (scorerDocQueue == null) {
+      initScorerDocQueue();
    }
+    return (scorerDocQueue.size() >= minimumNrMatchers)
+          && advanceAfterCurrent();
  }


  /** Advance all subscorers after the current document determined by the
-   * top of the <code>scorerQueue</code>.
+   * top of the <code>scorerDocQueue</code>.
   * Repeat until at least the minimum number of subscorers match on the same
   * document and all subscorers are after that document or are exhausted.
-   * <br>On entry the <code>scorerQueue</code> has at least <code>minimumNrMatchers</code>
+   * <br>On entry the <code>scorerDocQueue</code> has at least <code>minimumNrMatchers</code>
   * available. At least the scorer with the minimum document number will be advanced.
   * @return true iff there is a match.
   * <br>In case there is a match, </code>currentDoc</code>, </code>currentSumScore</code>,
@ -140,39 +160,32 @@ class DisjunctionSumScorer extends Scorer {
   * @todo Investigate whether it is possible to use skipTo() when
   * the minimum number of matchers is bigger than one, ie. try and use the
   * character of ConjunctionScorer for the minimum number of matchers.
+   * Also delay calling score() on the sub scorers until the minimum number of
+   * matchers is reached.
+   * <br>For this, a Scorer array with minimumNrMatchers elements might
+   * hold Scorers at currentDoc that are temporarily popped from scorerQueue.
   */
  protected boolean advanceAfterCurrent() throws IOException {
    do { // repeat until minimum nr of matchers
-      Scorer top = (Scorer) scorerQueue.top();
-      currentDoc = top.doc();
-      currentScore = top.score();
+      currentDoc = scorerDocQueue.topDoc();
+      currentScore = scorerDocQueue.topScore();
      nrMatchers = 1;
      do { // Until all subscorers are after currentDoc
-        if (top.next()) {
-          scorerQueue.adjustTop();
-        } else {
-          scorerQueue.pop();
-          if (scorerQueue.size() < (minimumNrMatchers - nrMatchers)) {
-            // Not enough subscorers left for a match on this document,
-            // and also no more chance of any further match.
-            return false;
-          }
-          if (scorerQueue.size() == 0) {
+        if (! scorerDocQueue.topNextAndAdjustElsePop()) {
+          if (--queueSize == 0) {
            break; // nothing more to advance, check for last match.
          }
        }
-        top = (Scorer) scorerQueue.top();
-        if (top.doc() != currentDoc) {
+        if (scorerDocQueue.topDoc() != currentDoc) {
          break; // All remaining subscorers are after currentDoc.
-        } else {
-          currentScore += top.score();
-          nrMatchers++;
        }
+        currentScore += scorerDocQueue.topScore();
+        nrMatchers++;
      } while (true);
      
      if (nrMatchers >= minimumNrMatchers) {
        return true;
-      } else if (scorerQueue.size() < minimumNrMatchers) {
+      } else if (queueSize < minimumNrMatchers) {
        return false;
      }
    } while (true);
@ -200,39 +213,49 @@ class DisjunctionSumScorer extends Scorer {
   * @return true iff there is such a match.
   */
  public boolean skipTo(int target) throws IOException {
-    if (scorerQueue == null) {
-      initScorerQueue();
+    if (scorerDocQueue == null) {
+      initScorerDocQueue();
    }
-    if (scorerQueue.size() < minimumNrMatchers) {
+    if (queueSize < minimumNrMatchers) {
      return false;
    }
    if (target <= currentDoc) {
      return true;
    }
    do {
-      Scorer top = (Scorer) scorerQueue.top();
-      if (top.doc() >= target) {
+      if (scorerDocQueue.topDoc() >= target) {
        return advanceAfterCurrent();
-      } else if (top.skipTo(target)) {
-        scorerQueue.adjustTop();
-      } else {
-        scorerQueue.pop();
-        if (scorerQueue.size() < minimumNrMatchers) {
+      } else if (! scorerDocQueue.topSkipToAndAdjustElsePop(target)) {
+        if (--queueSize < minimumNrMatchers) {
          return false;
        }
      }
    } while (true);
  }

- /** Gives and explanation for the score of a given document.
-  * @todo Show the resulting score. See BooleanScorer.explain() on how to do this.
-  */
+  /** @return An explanation for the score of a given document. */
  public Explanation explain(int doc) throws IOException {
    Explanation res = new Explanation();
-    res.setDescription("At least " + minimumNrMatchers + " of");
    Iterator ssi = subScorers.iterator();
+    float sumScore = 0.0f;
+    int nrMatches = 0;
    while (ssi.hasNext()) {
-      res.addDetail( ((Scorer) ssi.next()).explain(doc));
+      Explanation es = ((Scorer) ssi.next()).explain(doc);
+      if (es.getValue() > 0.0f) { // indicates match
+        sumScore += es.getValue();
+        nrMatches++;
+      }
+      res.addDetail(es);
+    }
+    if (nrMatchers >= minimumNrMatchers) {
+      res.setValue(sumScore);
+      res.setDescription("sum over at least " + minimumNrMatchers
+                         + " of " + subScorers.size() + ":");
+    } else {
+      res.setValue(0.0f);
+      res.setDescription(nrMatches + " match(es) but at least "
+                         + minimumNrMatchers + " of "
+                         + subScorers.size() + " needed");
    }
    return res;
  }
--- a/src/java/org/apache/lucene/util/ScorerDocQueue.java
+++ b/src/java/org/apache/lucene/util/ScorerDocQueue.java
@ -0,0 +1,214 @@
+package org.apache.lucene.util;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Derived from org.apache.lucene.util.PriorityQueue of March 2005 */
+
+import java.io.IOException;
+import org.apache.lucene.search.Scorer;
+
+/** A ScorerDocQueue maintains a partial ordering of its Scorers such that the
+  least Scorer can always be found in constant time.  Put()'s and pop()'s
+  require log(size) time. The ordering is by Scorer.doc().
+ */
+public class ScorerDocQueue {  // later: SpansQueue for spans with doc and term positions
+  private final HeapedScorerDoc[] heap;
+  private final int maxSize;
+  private int size;
+  
+  private class HeapedScorerDoc {
+    Scorer scorer;
+    int doc;
+    
+    HeapedScorerDoc(Scorer s) { this(s, s.doc()); }
+    
+    HeapedScorerDoc(Scorer scorer, int doc) {
+      this.scorer = scorer;
+      this.doc = doc;
+    }
+    
+    void adjust() { doc = scorer.doc(); }
+  }
+  
+  private HeapedScorerDoc topHSD; // same as heap[1], only for speed
+
+  /** Create a ScorerDocQueue with a maximum size. */
+  public ScorerDocQueue(int maxSize) {
+    // assert maxSize >= 0;
+    size = 0;
+    int heapSize = maxSize + 1;
+    heap = new HeapedScorerDoc[heapSize];
+    this.maxSize = maxSize;
+    topHSD = heap[1]; // initially null
+  }
+
+  /**
+   * Adds a Scorer to a ScorerDocQueue in log(size) time.
+   * If one tries to add more Scorers than maxSize
+   * a RuntimeException (ArrayIndexOutOfBound) is thrown.
+   */
+  public final void put(Scorer scorer) {
+    size++;
+    heap[size] = new HeapedScorerDoc(scorer);
+    upHeap();
+  }
+
+  /**
+   * Adds a Scorer to the ScorerDocQueue in log(size) time if either
+   * the ScorerDocQueue is not full, or not lessThan(scorer, top()).
+   * @param scorer
+   * @return true if scorer is added, false otherwise.
+   */
+  public boolean insert(Scorer scorer){
+    if (size < maxSize) {
+      put(scorer);
+      return true;
+    } else {
+      int docNr = scorer.doc();
+      if ((size > 0) && (! (docNr < topHSD.doc))) { // heap[1] is top()
+        heap[1] = new HeapedScorerDoc(scorer, docNr);
+        downHeap();
+        return true;
+      } else {
+        return false;
+      }
+    }
+   }
+
+  /** Returns the least Scorer of the ScorerDocQueue in constant time.
+   * Should not be used when the queue is empty.
+   */
+  public final Scorer top() {
+    // assert size > 0;
+    return topHSD.scorer;
+  }
+
+  /** Returns document number of the least Scorer of the ScorerDocQueue
+   * in constant time.
+   * Should not be used when the queue is empty.
+   */
+  public final int topDoc() {
+    // assert size > 0;
+    return topHSD.doc;
+  }
+  
+  public final float topScore() throws IOException {
+    // assert size > 0;
+    return topHSD.scorer.score();
+  }
+
+  public final boolean topNextAndAdjustElsePop() throws IOException {
+    return checkAdjustElsePop( topHSD.scorer.next());
+  }
+
+  public final boolean topSkipToAndAdjustElsePop(int target) throws IOException {
+    return checkAdjustElsePop( topHSD.scorer.skipTo(target));
+  }
+  
+  private boolean checkAdjustElsePop(boolean cond) {
+    if (cond) { // see also adjustTop
+      topHSD.doc = topHSD.scorer.doc();
+    } else { // see also popNoResult
+      heap[1] = heap[size]; // move last to first
+      heap[size] = null;
+      size--;
+    }
+    downHeap();
+    return cond;
+  }
+
+  /** Removes and returns the least scorer of the ScorerDocQueue in log(size)
+   * time.
+   * Should not be used when the queue is empty.
+   */
+  public final Scorer pop() {
+    // assert size > 0;
+    Scorer result = topHSD.scorer;
+    popNoResult();
+    return result;
+  }
+  
+  /** Removes the least scorer of the ScorerDocQueue in log(size) time.
+   * Should not be used when the queue is empty.
+   */
+  private final void popNoResult() {
+    heap[1] = heap[size]; // move last to first
+    heap[size] = null;
+    size--;
+    downHeap();	// adjust heap
+  }
+
+  /** Should be called when the scorer at top changes doc() value.
+   * Still log(n) worst case, but it's at least twice as fast to <pre>
+   *  { pq.top().change(); pq.adjustTop(); }
+   * </pre> instead of <pre>
+   *  { o = pq.pop(); o.change(); pq.push(o); }
+   * </pre>
+   */
+  public final void adjustTop() {
+    // assert size > 0;
+    topHSD.adjust();
+    downHeap();
+  }
+
+  /** Returns the number of scorers currently stored in the ScorerDocQueue. */
+  public final int size() {
+    return size;
+  }
+
+  /** Removes all entries from the ScorerDocQueue. */
+  public final void clear() {
+    for (int i = 0; i <= size; i++) {
+      heap[i] = null;
+    }
+    size = 0;
+  }
+
+  private final void upHeap() {
+    int i = size;
+    HeapedScorerDoc node = heap[i];		  // save bottom node
+    int j = i >>> 1;
+    while ((j > 0) && (node.doc < heap[j].doc)) {
+      heap[i] = heap[j];			  // shift parents down
+      i = j;
+      j = j >>> 1;
+    }
+    heap[i] = node;				  // install saved node
+    topHSD = heap[1];
+  }
+
+  private final void downHeap() {
+    int i = 1;
+    HeapedScorerDoc node = heap[i];	          // save top node
+    int j = i << 1;				  // find smaller child
+    int k = j + 1;
+    if ((k <= size) && (heap[k].doc < heap[j].doc)) {
+      j = k;
+    }
+    while ((j <= size) && (heap[j].doc < node.doc)) {
+      heap[i] = heap[j];			  // shift up child
+      i = j;
+      j = i << 1;
+      k = j + 1;
+      if (k <= size && (heap[k].doc < heap[j].doc)) {
+	j = k;
+      }
+    }
+    heap[i] = node;				  // install saved node
+    topHSD = heap[1];
+  }
+}