From a5877d42b0f8a32f6e62de2c8967bfd5082a728d Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Wed, 15 Apr 2015 00:54:03 +0000
Subject: [PATCH] LUCENE-6421: defer reading of positions in MultiPhraseQuery
 until they are needed

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673595 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                            |   3 +
 .../lucene/search/MultiPhraseQuery.java       | 344 ++++++++----------
 .../org/apache/lucene/search/PhraseQuery.java |  11 +-
 .../lucene/search/TestMultiPhraseEnum.java    | 116 ++++++
 .../search/SearchEquivalenceTestBase.java     |   2 +-
 5 files changed, 280 insertions(+), 196 deletions(-)
 create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseEnum.java
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 25c99dc152e..0d43e4a281e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -77,6 +77,9 @@ Optimizations
 * LUCENE-6388: Optimize SpanNearQuery when payloads are not present.
   (Robert Muir)
 
+* LUCENE-6421: Defer reading of positions in MultiPhraseQuery until
+  they are needed. (Robert Muir)
+
 Bug Fixes
 
 * LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
index 7a349db8bf6..524a98337bb 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@@ -200,45 +200,28 @@ public class MultiPhraseQuery extends Query {
 
       for (int pos=0; pos<postingsFreqs.length; pos++) {
         Term[] terms = termArrays.get(pos);
-
-        final PostingsEnum postingsEnum;
-        int docFreq;
-
-        if (terms.length > 1) {
-          postingsEnum = new UnionPostingsEnum(liveDocs, context, terms, termContexts, termsEnum);
-
-          // coarse -- this overcounts since a given doc can
-          // have more than one term:
-          docFreq = 0;
-          for(int termIdx=0;termIdx<terms.length;termIdx++) {
-            final Term term = terms[termIdx];
-            TermState termState = termContexts.get(term).get(context.ord);
-            if (termState == null) {
-              // Term not in reader
-              continue;
-            }
-            termsEnum.seekExact(term.bytes(), termState);
-            docFreq += termsEnum.docFreq();
-          }
-
-          if (docFreq == 0) {
-            // None of the terms are in this reader
-            return null;
-          }
-        } else {
-          final Term term = terms[0];
+        List<PostingsEnum> postings = new ArrayList<>();
+        
+        for (Term term : terms) {
           TermState termState = termContexts.get(term).get(context.ord);
-          if (termState == null) {
-            // Term not in reader
-            return null;
+          if (termState != null) {
+            termsEnum.seekExact(term.bytes(), termState);
+            postings.add(termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS));
           }
-          termsEnum.seekExact(term.bytes(), termState);
-          postingsEnum = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
-
-          docFreq = termsEnum.docFreq();
+        }
+        
+        if (postings.isEmpty()) {
+          return null;
+        }
+        
+        final PostingsEnum postingsEnum;
+        if (postings.size() == 1) {
+          postingsEnum = postings.get(0);
+        } else {
+          postingsEnum = new UnionPostingsEnum(postings);
         }
 
-        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms);
+        postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, positions.get(pos).intValue(), terms);
       }
 
       // sort by increasing docFreq order
@@ -398,175 +381,164 @@ public class MultiPhraseQuery extends Query {
     }
     return true;
   }
-}
-
-/**
- * Takes the logical union of multiple DocsEnum iterators.
- */
-
-// TODO: if ever we allow subclassing of the *PhraseScorer
-class UnionPostingsEnum extends PostingsEnum {
-
-  private static final class DocsQueue extends PriorityQueue<PostingsEnum> {
-    DocsQueue(List<PostingsEnum> postingsEnums) throws IOException {
-      super(postingsEnums.size());
-
-      Iterator<PostingsEnum> i = postingsEnums.iterator();
-      while (i.hasNext()) {
-        PostingsEnum postings = i.next();
-        if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
-          add(postings);
-        }
+  
+  /** 
+   * Takes the logical union of multiple PostingsEnum iterators.
+   * <p>
+   * Note: positions are merged during freq()
+   */
+  static class UnionPostingsEnum extends PostingsEnum {
+    /** queue ordered by docid */
+    final DocsQueue docsQueue;
+    /** cost of this enum: sum of its subs */
+    final long cost;
+    
+    /** queue ordered by position for current doc */
+    final PositionsQueue posQueue = new PositionsQueue();
+    /** current doc posQueue is working */
+    int posQueueDoc = -2;
+    /** list of subs (unordered) */
+    final PostingsEnum[] subs;
+    
+    UnionPostingsEnum(Collection<PostingsEnum> subs) {
+      docsQueue = new DocsQueue(subs.size());
+      long cost = 0;
+      for (PostingsEnum sub : subs) {
+        docsQueue.add(sub);
+        cost += sub.cost();
       }
+      this.cost = cost;
+      this.subs = subs.toArray(new PostingsEnum[subs.size()]);
     }
 
     @Override
-    public final boolean lessThan(PostingsEnum a, PostingsEnum b) {
-      return a.docID() < b.docID();
+    public int freq() throws IOException {
+      int doc = docID();
+      if (doc != posQueueDoc) {
+        posQueue.clear();
+        for (PostingsEnum sub : subs) {
+          if (sub.docID() == doc) {
+            int freq = sub.freq();
+            for (int i = 0; i < freq; i++) {
+              posQueue.add(sub.nextPosition());
+            }
+          }
+        }
+        posQueue.sort();
+        posQueueDoc = doc;
+      }
+      return posQueue.size();
     }
-  }
 
-  private static final class IntQueue {
-    private int _arraySize = 16;
-    private int _index = 0;
-    private int _lastIndex = 0;
-    private int[] _array = new int[_arraySize];
+    @Override
+    public int nextPosition() throws IOException {
+      return posQueue.next();
+    }
+
+    @Override
+    public int docID() {
+      return docsQueue.top().docID();
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      PostingsEnum top = docsQueue.top();
+      int doc = top.docID();
+      
+      do {
+        top.nextDoc();
+        top = docsQueue.updateTop();
+      } while (top.docID() == doc);
+
+      return top.docID();
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      PostingsEnum top = docsQueue.top();
+      
+      do {
+        top.advance(target);
+        top = docsQueue.updateTop();
+      } while (top.docID() < target);
+
+      return top.docID();
+    }
+
+    @Override
+    public long cost() {
+      return cost;
+    }
     
-    final void add(int i) {
-      if (_lastIndex == _arraySize)
-        growArray();
-
-      _array[_lastIndex++] = i;
+    @Override
+    public int startOffset() throws IOException {
+      return -1; // offsets are unsupported
     }
 
-    final int next() {
-      return _array[_index++];
+    @Override
+    public int endOffset() throws IOException {
+      return -1; // offsets are unsupported
     }
 
-    final void sort() {
-      Arrays.sort(_array, _index, _lastIndex);
+    @Override
+    public BytesRef getPayload() throws IOException {
+      return null; // payloads are unsupported
     }
-
-    final void clear() {
-      _index = 0;
-      _lastIndex = 0;
-    }
-
-    final int size() {
-      return (_lastIndex - _index);
-    }
-
-    private void growArray() {
-      int[] newArray = new int[_arraySize * 2];
-      System.arraycopy(_array, 0, newArray, 0, _arraySize);
-      _array = newArray;
-      _arraySize *= 2;
-    }
-  }
-
-  private int _doc = -1;
-  private int _freq;
-  private DocsQueue _queue;
-  private IntQueue _posList;
-  private long cost;
-
-  public UnionPostingsEnum(Bits liveDocs, LeafReaderContext context, Term[] terms, Map<Term, TermContext> termContexts, TermsEnum termsEnum) throws IOException {
-    List<PostingsEnum> postingsEnums = new LinkedList<>();
-    for (int i = 0; i < terms.length; i++) {
-      final Term term = terms[i];
-      TermState termState = termContexts.get(term).get(context.ord);
-      if (termState == null) {
-        // Term doesn't exist in reader
-        continue;
-      }
-      termsEnum.seekExact(term.bytes(), termState);
-      PostingsEnum postings = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
-      cost += postings.cost();
-      postingsEnums.add(postings);
-    }
-
-    _queue = new DocsQueue(postingsEnums);
-    _posList = new IntQueue();
-  }
-
-  @Override
-  public final int nextDoc() throws IOException {
-    if (_queue.size() == 0) {
-      return _doc = NO_MORE_DOCS;
-    }
-
-    // TODO: move this init into positions(): if the search
-    // doesn't need the positions for this doc then don't
-    // waste CPU merging them:
-    _posList.clear();
-    _doc = _queue.top().docID();
-
-    // merge sort all positions together
-    PostingsEnum postings;
-    do {
-      postings = _queue.top();
-
-      final int freq = postings.freq();
-      for (int i = 0; i < freq; i++) {
-        _posList.add(postings.nextPosition());
+    
+    /** 
+     * disjunction of postings ordered by docid.
+     */
+    static class DocsQueue extends PriorityQueue<PostingsEnum> {
+      DocsQueue(int size) {
+        super(size);
       }
 
-      if (postings.nextDoc() != NO_MORE_DOCS) {
-        _queue.updateTop();
-      } else {
-        _queue.pop();
-      }
-    } while (_queue.size() > 0 && _queue.top().docID() == _doc);
-
-    _posList.sort();
-    _freq = _posList.size();
-
-    return _doc;
-  }
-
-  @Override
-  public int nextPosition() {
-    return _posList.next();
-  }
-
-  @Override
-  public int startOffset() {
-    return -1;
-  }
-
-  @Override
-  public int endOffset() {
-    return -1;
-  }
-
-  @Override
-  public BytesRef getPayload() {
-    return null;
-  }
-
-  @Override
-  public final int advance(int target) throws IOException {
-    while (_queue.top() != null && target > _queue.top().docID()) {
-      PostingsEnum postings = _queue.pop();
-      if (postings.advance(target) != NO_MORE_DOCS) {
-        _queue.add(postings);
+      @Override
+      public final boolean lessThan(PostingsEnum a, PostingsEnum b) {
+        return a.docID() < b.docID();
       }
     }
-    return nextDoc();
-  }
+    
+    /** 
+     * queue of terms for a single document. its a sorted array of
+     * all the positions from all the postings
+     */
+    static class PositionsQueue {
+      private int arraySize = 16;
+      private int index = 0;
+      private int size = 0;
+      private int[] array = new int[arraySize];
+      
+      void add(int i) {
+        if (size == arraySize)
+          growArray();
 
-  @Override
-  public final int freq() {
-    return _freq;
-  }
+        array[size++] = i;
+      }
 
-  @Override
-  public final int docID() {
-    return _doc;
-  }
+      int next() {
+        return array[index++];
+      }
 
-  @Override
-  public long cost() {
-    return cost;
+      void sort() {
+        Arrays.sort(array, index, size);
+      }
+
+      void clear() {
+        index = 0;
+        size = 0;
+      }
+
+      int size() {
+        return size;
+      }
+
+      private void growArray() {
+        int[] newArray = new int[arraySize * 2];
+        System.arraycopy(array, 0, newArray, 0, arraySize);
+        array = newArray;
+        arraySize *= 2;
+      }
+    }
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index 95bfb28c06c..bc809d2db7b 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -174,14 +174,12 @@ public class PhraseQuery extends Query {
 
   static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
     final PostingsEnum postings;
-    final int docFreq;
     final int position;
     final Term[] terms;
     final int nTerms; // for faster comparisons
 
-    public PostingsAndFreq(PostingsEnum postings, int docFreq, int position, Term... terms) {
+    public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) {
       this.postings = postings;
-      this.docFreq = docFreq;
       this.position = position;
       nTerms = terms==null ? 0 : terms.length;
       if (nTerms>0) {
@@ -200,9 +198,6 @@ public class PhraseQuery extends Query {
 
     @Override
     public int compareTo(PostingsAndFreq other) {
-      if (docFreq != other.docFreq) {
-        return docFreq - other.docFreq;
-      }
       if (position != other.position) {
         return position - other.position;
       }
@@ -223,7 +218,6 @@ public class PhraseQuery extends Query {
     public int hashCode() {
       final int prime = 31;
       int result = 1;
-      result = prime * result + docFreq;
       result = prime * result + position;
       for (int i=0; i<nTerms; i++) {
         result = prime * result + terms[i].hashCode(); 
@@ -237,7 +231,6 @@ public class PhraseQuery extends Query {
       if (obj == null) return false;
       if (getClass() != obj.getClass()) return false;
       PostingsAndFreq other = (PostingsAndFreq) obj;
-      if (docFreq != other.docFreq) return false;
       if (position != other.position) return false;
       if (terms == null) return other.terms == null;
       return Arrays.equals(terms, other.terms);
@@ -313,7 +306,7 @@ public class PhraseQuery extends Query {
         }
         te.seekExact(t.bytes(), state);
         PostingsEnum postingsEnum = te.postings(liveDocs, null, PostingsEnum.POSITIONS);
-        postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i), t);
+        postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions.get(i), t);
       }
 
       // sort by increasing docFreq order
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseEnum.java b/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseEnum.java
new file mode 100644
index 00000000000..d31d6524836
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseEnum.java
@@ -0,0 +1,116 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/** simple tests for unionpostingsenum */
+public class TestMultiPhraseEnum extends LuceneTestCase {
+  
+  /** Tests union on one document  */
+  public void testOneDocument() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig();
+    iwc.setMergePolicy(newLogMergePolicy());
+    IndexWriter writer = new IndexWriter(dir, iwc);
+    
+    Document doc = new Document();
+    doc.add(new TextField("field", "foo bar", Field.Store.NO));
+    writer.addDocument(doc);
+    
+    DirectoryReader ir = DirectoryReader.open(writer, true);
+    writer.close();
+
+    PostingsEnum p1 = getOnlySegmentReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
+    PostingsEnum p2 = getOnlySegmentReader(ir).postings(new Term("field", "bar"), PostingsEnum.POSITIONS);
+    PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
+    
+    assertEquals(-1, union.docID());
+    
+    assertEquals(0, union.nextDoc());
+    assertEquals(2, union.freq());
+    assertEquals(0, union.nextPosition());
+    assertEquals(1, union.nextPosition());
+    
+    assertEquals(DocIdSetIterator.NO_MORE_DOCS, union.nextDoc());
+    
+    ir.close();
+    dir.close();
+  }
+  
+  /** Tests union on a few documents  */
+  public void testSomeDocuments() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig();
+    iwc.setMergePolicy(newLogMergePolicy());
+    IndexWriter writer = new IndexWriter(dir, iwc);
+    
+    Document doc = new Document();
+    doc.add(new TextField("field", "foo", Field.Store.NO));
+    writer.addDocument(doc);
+    
+    writer.addDocument(new Document());
+    
+    doc = new Document();
+    doc.add(new TextField("field", "foo bar", Field.Store.NO));
+    writer.addDocument(doc);
+    
+    doc = new Document();
+    doc.add(new TextField("field", "bar", Field.Store.NO));
+    writer.addDocument(doc);
+    
+    DirectoryReader ir = DirectoryReader.open(writer, true);
+    writer.close();
+
+    PostingsEnum p1 = getOnlySegmentReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
+    PostingsEnum p2 = getOnlySegmentReader(ir).postings(new Term("field", "bar"), PostingsEnum.POSITIONS);
+    PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
+    
+    assertEquals(-1, union.docID());
+    
+    assertEquals(0, union.nextDoc());
+    assertEquals(1, union.freq());
+    assertEquals(0, union.nextPosition());
+    
+    assertEquals(2, union.nextDoc());
+    assertEquals(2, union.freq());
+    assertEquals(0, union.nextPosition());
+    assertEquals(1, union.nextPosition());
+    
+    assertEquals(3, union.nextDoc());
+    assertEquals(1, union.freq());
+    assertEquals(0, union.nextPosition());
+    
+    assertEquals(DocIdSetIterator.NO_MORE_DOCS, union.nextDoc());
+    
+    ir.close();
+    dir.close();
+  }
+}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java b/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java
index c756d787852..80d9b3b0da0 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java
@@ -275,7 +275,7 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
       // not efficient, but simple!
       TopDocs td1 = s1.search(q1, reader.maxDoc(), sort);
       TopDocs td2 = s2.search(q2, reader.maxDoc(), sort);
-      assertTrue(td1.totalHits <= td2.totalHits);
+      assertTrue("too many hits: " + td1.totalHits + " > " + td2.totalHits, td1.totalHits <= td2.totalHits);
       
       // fill the superset into a bitset
       BitSet bitset = new BitSet();