LUCENE-6421: revert

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673577 13f79535-47bb-0310-9956-ffa450edef68
2015-04-14 23:02:53 +00:00 · 2015-04-14 23:02:53 +00:00 · 0aa98d464c
parent d185168be9
commit 0aa98d464c
3 changed files with 203 additions and 173 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -77,9 +77,6 @@ Optimizations
 * LUCENE-6388: Optimize SpanNearQuery when payloads are not present.
  (Robert Muir)

-* LUCENE-6421: Defer reading of positions in MultiPhraseQuery until
-  they are needed. (Robert Muir)
-
 Bug Fixes

 * LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
--- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@ -200,30 +200,45 @@ public class MultiPhraseQuery extends Query {

      for (int pos=0; pos<postingsFreqs.length; pos++) {
        Term[] terms = termArrays.get(pos);
-        List<PostingsEnum> postings = new ArrayList<>();

-        for (Term term : terms) {
+        final PostingsEnum postingsEnum;
+        int docFreq;
+
+        if (terms.length > 1) {
+          postingsEnum = new UnionPostingsEnum(liveDocs, context, terms, termContexts, termsEnum);
+
+          // coarse -- this overcounts since a given doc can
+          // have more than one term:
+          docFreq = 0;
+          for(int termIdx=0;termIdx<terms.length;termIdx++) {
+            final Term term = terms[termIdx];
+            TermState termState = termContexts.get(term).get(context.ord);
+            if (termState == null) {
+              // Term not in reader
+              continue;
+            }
+            termsEnum.seekExact(term.bytes(), termState);
+            docFreq += termsEnum.docFreq();
+          }
+
+          if (docFreq == 0) {
+            // None of the terms are in this reader
+            return null;
+          }
+        } else {
+          final Term term = terms[0];
          TermState termState = termContexts.get(term).get(context.ord);
          if (termState == null) {
            // Term not in reader
            return null;
          }
          termsEnum.seekExact(term.bytes(), termState);
-          postings.add(termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS));
+          postingsEnum = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
+
+          docFreq = termsEnum.docFreq();
        }

-        if (postings.isEmpty()) {
-          return null;
-        }
-        
-        final PostingsEnum postingsEnum;
-        if (postings.size() == 1) {
-          postingsEnum = postings.get(0);
-        } else {
-          postingsEnum = new UnionPostingsEnum(postings);
-        }
-
-        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, positions.get(pos).intValue(), terms);
+        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms);
      }

      // sort by increasing docFreq order
@ -383,116 +398,26 @@ public class MultiPhraseQuery extends Query {
    }
    return true;
  }
-  
-  /** 
-   * Takes the logical union of multiple PostingsEnum iterators.
-   * <p>
-   * Note: positions are merged during freq()
-   */
-  static class UnionPostingsEnum extends PostingsEnum {
-    /** queue ordered by docid */
-    final DocsQueue docsQueue;
-    /** cost of this enum: sum of its subs */
-    final long cost;
-    
-    /** queue ordered by position for current doc */
-    final PositionsQueue posQueue = new PositionsQueue();
-    /** current doc posQueue is working */
-    int posQueueDoc = -2;
-    /** list of subs (unordered) */
-    final PostingsEnum[] subs;
-    
-    UnionPostingsEnum(Collection<PostingsEnum> subs) {
-      docsQueue = new DocsQueue(subs.size());
-      long cost = 0;
-      for (PostingsEnum sub : subs) {
-        docsQueue.add(sub);
-        cost += sub.cost();
-      }
-      this.cost = cost;
-      this.subs = subs.toArray(new PostingsEnum[subs.size()]);
-    }
-
-    @Override
-    public int freq() throws IOException {
-      int doc = docID();
-      if (doc != posQueueDoc) {
-        posQueue.clear();
-        for (PostingsEnum sub : subs) {
-          if (sub.docID() == doc) {
-            int freq = sub.freq();
-            for (int i = 0; i < freq; i++) {
-              posQueue.add(sub.nextPosition());
-            }
-          }
-        }
-        posQueue.sort();
-        posQueueDoc = doc;
-      }
-      return posQueue.size();
-    }
-
-    @Override
-    public int nextPosition() throws IOException {
-      return posQueue.next();
-    }
-
-    @Override
-    public int docID() {
-      return docsQueue.top().docID();
-    }
-
-    @Override
-    public int nextDoc() throws IOException {
-      PostingsEnum top = docsQueue.top();
-      int doc = top.docID();
-      
-      do {
-        top.nextDoc();
-        top = docsQueue.updateTop();
-      } while (top.docID() == doc);
-
-      return top.docID();
-    }
-
-    @Override
-    public int advance(int target) throws IOException {
-      PostingsEnum top = docsQueue.top();
-      
-      do {
-        top.advance(target);
-        top = docsQueue.updateTop();
-      } while (top.docID() < target);
-
-      return top.docID();
-    }
-
-    @Override
-    public long cost() {
-      return cost;
-    }
-    
-    @Override
-    public int startOffset() throws IOException {
-      return -1; // offsets are unsupported
-    }
-
-    @Override
-    public int endOffset() throws IOException {
-      return -1; // offsets are unsupported
-    }
-
-    @Override
-    public BytesRef getPayload() throws IOException {
-      return null; // payloads are unsupported
 }

 /**
-     * disjunction of postings ordered by docid.
+ * Takes the logical union of multiple DocsEnum iterators.
 */
-    static class DocsQueue extends PriorityQueue<PostingsEnum> {
-      DocsQueue(int size) {
-        super(size);
+
+// TODO: if ever we allow subclassing of the *PhraseScorer
+class UnionPostingsEnum extends PostingsEnum {
+
+  private static final class DocsQueue extends PriorityQueue<PostingsEnum> {
+    DocsQueue(List<PostingsEnum> postingsEnums) throws IOException {
+      super(postingsEnums.size());
+
+      Iterator<PostingsEnum> i = postingsEnums.iterator();
+      while (i.hasNext()) {
+        PostingsEnum postings = i.next();
+        if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+          add(postings);
+        }
+      }
    }

    @Override
@ -501,46 +426,147 @@ public class MultiPhraseQuery extends Query {
    }
  }

-    /** 
-     * queue of terms for a single document. its a sorted array of
-     * all the positions from all the postings
-     */
-    static class PositionsQueue {
-      private int arraySize = 16;
-      private int index = 0;
-      private int size = 0;
-      private int[] array = new int[arraySize];
+  private static final class IntQueue {
+    private int _arraySize = 16;
+    private int _index = 0;
+    private int _lastIndex = 0;
+    private int[] _array = new int[_arraySize];
    
-      void add(int i) {
-        if (size == arraySize)
+    final void add(int i) {
+      if (_lastIndex == _arraySize)
        growArray();

-        array[size++] = i;
+      _array[_lastIndex++] = i;
    }

-      int next() {
-        return array[index++];
+    final int next() {
+      return _array[_index++];
    }

-      void sort() {
-        Arrays.sort(array, index, size);
+    final void sort() {
+      Arrays.sort(_array, _index, _lastIndex);
    }

-      void clear() {
-        index = 0;
-        size = 0;
+    final void clear() {
+      _index = 0;
+      _lastIndex = 0;
    }

-      int size() {
-        return size;
+    final int size() {
+      return (_lastIndex - _index);
    }

    private void growArray() {
-        int[] newArray = new int[arraySize * 2];
-        System.arraycopy(array, 0, newArray, 0, arraySize);
-        array = newArray;
-        arraySize *= 2;
+      int[] newArray = new int[_arraySize * 2];
+      System.arraycopy(_array, 0, newArray, 0, _arraySize);
+      _array = newArray;
+      _arraySize *= 2;
    }
  }
+
+  private int _doc = -1;
+  private int _freq;
+  private DocsQueue _queue;
+  private IntQueue _posList;
+  private long cost;
+
+  public UnionPostingsEnum(Bits liveDocs, LeafReaderContext context, Term[] terms, Map<Term, TermContext> termContexts, TermsEnum termsEnum) throws IOException {
+    List<PostingsEnum> postingsEnums = new LinkedList<>();
+    for (int i = 0; i < terms.length; i++) {
+      final Term term = terms[i];
+      TermState termState = termContexts.get(term).get(context.ord);
+      if (termState == null) {
+        // Term doesn't exist in reader
+        continue;
+      }
+      termsEnum.seekExact(term.bytes(), termState);
+      PostingsEnum postings = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
+      cost += postings.cost();
+      postingsEnums.add(postings);
+    }
+
+    _queue = new DocsQueue(postingsEnums);
+    _posList = new IntQueue();
+  }
+
+  @Override
+  public final int nextDoc() throws IOException {
+    if (_queue.size() == 0) {
+      return _doc = NO_MORE_DOCS;
+    }
+
+    // TODO: move this init into positions(): if the search
+    // doesn't need the positions for this doc then don't
+    // waste CPU merging them:
+    _posList.clear();
+    _doc = _queue.top().docID();
+
+    // merge sort all positions together
+    PostingsEnum postings;
+    do {
+      postings = _queue.top();
+
+      final int freq = postings.freq();
+      for (int i = 0; i < freq; i++) {
+        _posList.add(postings.nextPosition());
+      }
+
+      if (postings.nextDoc() != NO_MORE_DOCS) {
+        _queue.updateTop();
+      } else {
+        _queue.pop();
+      }
+    } while (_queue.size() > 0 && _queue.top().docID() == _doc);
+
+    _posList.sort();
+    _freq = _posList.size();
+
+    return _doc;
+  }
+
+  @Override
+  public int nextPosition() {
+    return _posList.next();
+  }
+
+  @Override
+  public int startOffset() {
+    return -1;
+  }
+
+  @Override
+  public int endOffset() {
+    return -1;
+  }
+
+  @Override
+  public BytesRef getPayload() {
+    return null;
+  }
+
+  @Override
+  public final int advance(int target) throws IOException {
+    while (_queue.top() != null && target > _queue.top().docID()) {
+      PostingsEnum postings = _queue.pop();
+      if (postings.advance(target) != NO_MORE_DOCS) {
+        _queue.add(postings);
+      }
+    }
+    return nextDoc();
+  }
+
+  @Override
+  public final int freq() {
+    return _freq;
+  }
+
+  @Override
+  public final int docID() {
+    return _doc;
+  }
+
+  @Override
+  public long cost() {
+    return cost;
  }
 }
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@ -174,12 +174,14 @@ public class PhraseQuery extends Query {

  static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
    final PostingsEnum postings;
+    final int docFreq;
    final int position;
    final Term[] terms;
    final int nTerms; // for faster comparisons

-    public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) {
+    public PostingsAndFreq(PostingsEnum postings, int docFreq, int position, Term... terms) {
      this.postings = postings;
+      this.docFreq = docFreq;
      this.position = position;
      nTerms = terms==null ? 0 : terms.length;
      if (nTerms>0) {
@ -198,6 +200,9 @@ public class PhraseQuery extends Query {

    @Override
    public int compareTo(PostingsAndFreq other) {
+      if (docFreq != other.docFreq) {
+        return docFreq - other.docFreq;
+      }
      if (position != other.position) {
        return position - other.position;
      }
@ -218,6 +223,7 @@ public class PhraseQuery extends Query {
    public int hashCode() {
      final int prime = 31;
      int result = 1;
+      result = prime * result + docFreq;
      result = prime * result + position;
      for (int i=0; i<nTerms; i++) {
        result = prime * result + terms[i].hashCode(); 
@ -231,6 +237,7 @@ public class PhraseQuery extends Query {
      if (obj == null) return false;
      if (getClass() != obj.getClass()) return false;
      PostingsAndFreq other = (PostingsAndFreq) obj;
+      if (docFreq != other.docFreq) return false;
      if (position != other.position) return false;
      if (terms == null) return other.terms == null;
      return Arrays.equals(terms, other.terms);
@ -306,7 +313,7 @@ public class PhraseQuery extends Query {
        }
        te.seekExact(t.bytes(), state);
        PostingsEnum postingsEnum = te.postings(liveDocs, null, PostingsEnum.POSITIONS);
-        postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions.get(i), t);
+        postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i), t);
      }

      // sort by increasing docFreq order