From a5877d42b0f8a32f6e62de2c8967bfd5082a728d Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 15 Apr 2015 00:54:03 +0000 Subject: [PATCH] LUCENE-6421: defer reading of positions in MultiPhraseQuery until they are needed git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673595 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../lucene/search/MultiPhraseQuery.java | 344 ++++++++---------- .../org/apache/lucene/search/PhraseQuery.java | 11 +- .../lucene/search/TestMultiPhraseEnum.java | 116 ++++++ .../search/SearchEquivalenceTestBase.java | 2 +- 5 files changed, 280 insertions(+), 196 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestMultiPhraseEnum.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 25c99dc152e..0d43e4a281e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -77,6 +77,9 @@ Optimizations * LUCENE-6388: Optimize SpanNearQuery when payloads are not present. (Robert Muir) +* LUCENE-6421: Defer reading of positions in MultiPhraseQuery until + they are needed. (Robert Muir) + Bug Fixes * LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause. diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 7a349db8bf6..524a98337bb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -200,45 +200,28 @@ public class MultiPhraseQuery extends Query { for (int pos=0; pos 1) { - postingsEnum = new UnionPostingsEnum(liveDocs, context, terms, termContexts, termsEnum); - - // coarse -- this overcounts since a given doc can - // have more than one term: - docFreq = 0; - for(int termIdx=0;termIdx postings = new ArrayList<>(); + + for (Term term : terms) { TermState termState = termContexts.get(term).get(context.ord); - if (termState == null) { - // Term not in reader - return null; + if (termState != null) { + termsEnum.seekExact(term.bytes(), termState); + postings.add(termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS)); } - termsEnum.seekExact(term.bytes(), termState); - postingsEnum = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS); - - docFreq = termsEnum.docFreq(); + } + + if (postings.isEmpty()) { + return null; + } + + final PostingsEnum postingsEnum; + if (postings.size() == 1) { + postingsEnum = postings.get(0); + } else { + postingsEnum = new UnionPostingsEnum(postings); } - postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms); + postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, positions.get(pos).intValue(), terms); } // sort by increasing docFreq order @@ -398,175 +381,164 @@ public class MultiPhraseQuery extends Query { } return true; } -} - -/** - * Takes the logical union of multiple DocsEnum iterators. - */ - -// TODO: if ever we allow subclassing of the *PhraseScorer -class UnionPostingsEnum extends PostingsEnum { - - private static final class DocsQueue extends PriorityQueue { - DocsQueue(List postingsEnums) throws IOException { - super(postingsEnums.size()); - - Iterator i = postingsEnums.iterator(); - while (i.hasNext()) { - PostingsEnum postings = i.next(); - if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - add(postings); - } + + /** + * Takes the logical union of multiple PostingsEnum iterators. + *

+ * Note: positions are merged during freq() + */ + static class UnionPostingsEnum extends PostingsEnum { + /** queue ordered by docid */ + final DocsQueue docsQueue; + /** cost of this enum: sum of its subs */ + final long cost; + + /** queue ordered by position for current doc */ + final PositionsQueue posQueue = new PositionsQueue(); + /** current doc posQueue is working */ + int posQueueDoc = -2; + /** list of subs (unordered) */ + final PostingsEnum[] subs; + + UnionPostingsEnum(Collection subs) { + docsQueue = new DocsQueue(subs.size()); + long cost = 0; + for (PostingsEnum sub : subs) { + docsQueue.add(sub); + cost += sub.cost(); } + this.cost = cost; + this.subs = subs.toArray(new PostingsEnum[subs.size()]); } @Override - public final boolean lessThan(PostingsEnum a, PostingsEnum b) { - return a.docID() < b.docID(); + public int freq() throws IOException { + int doc = docID(); + if (doc != posQueueDoc) { + posQueue.clear(); + for (PostingsEnum sub : subs) { + if (sub.docID() == doc) { + int freq = sub.freq(); + for (int i = 0; i < freq; i++) { + posQueue.add(sub.nextPosition()); + } + } + } + posQueue.sort(); + posQueueDoc = doc; + } + return posQueue.size(); } - } - private static final class IntQueue { - private int _arraySize = 16; - private int _index = 0; - private int _lastIndex = 0; - private int[] _array = new int[_arraySize]; + @Override + public int nextPosition() throws IOException { + return posQueue.next(); + } + + @Override + public int docID() { + return docsQueue.top().docID(); + } + + @Override + public int nextDoc() throws IOException { + PostingsEnum top = docsQueue.top(); + int doc = top.docID(); + + do { + top.nextDoc(); + top = docsQueue.updateTop(); + } while (top.docID() == doc); + + return top.docID(); + } + + @Override + public int advance(int target) throws IOException { + PostingsEnum top = docsQueue.top(); + + do { + top.advance(target); + top = docsQueue.updateTop(); + } while (top.docID() < target); + + return top.docID(); + } + + @Override + public long cost() { + return cost; + } - final void add(int i) { - if (_lastIndex == _arraySize) - growArray(); - - _array[_lastIndex++] = i; + @Override + public int startOffset() throws IOException { + return -1; // offsets are unsupported } - final int next() { - return _array[_index++]; + @Override + public int endOffset() throws IOException { + return -1; // offsets are unsupported } - final void sort() { - Arrays.sort(_array, _index, _lastIndex); + @Override + public BytesRef getPayload() throws IOException { + return null; // payloads are unsupported } - - final void clear() { - _index = 0; - _lastIndex = 0; - } - - final int size() { - return (_lastIndex - _index); - } - - private void growArray() { - int[] newArray = new int[_arraySize * 2]; - System.arraycopy(_array, 0, newArray, 0, _arraySize); - _array = newArray; - _arraySize *= 2; - } - } - - private int _doc = -1; - private int _freq; - private DocsQueue _queue; - private IntQueue _posList; - private long cost; - - public UnionPostingsEnum(Bits liveDocs, LeafReaderContext context, Term[] terms, Map termContexts, TermsEnum termsEnum) throws IOException { - List postingsEnums = new LinkedList<>(); - for (int i = 0; i < terms.length; i++) { - final Term term = terms[i]; - TermState termState = termContexts.get(term).get(context.ord); - if (termState == null) { - // Term doesn't exist in reader - continue; - } - termsEnum.seekExact(term.bytes(), termState); - PostingsEnum postings = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS); - cost += postings.cost(); - postingsEnums.add(postings); - } - - _queue = new DocsQueue(postingsEnums); - _posList = new IntQueue(); - } - - @Override - public final int nextDoc() throws IOException { - if (_queue.size() == 0) { - return _doc = NO_MORE_DOCS; - } - - // TODO: move this init into positions(): if the search - // doesn't need the positions for this doc then don't - // waste CPU merging them: - _posList.clear(); - _doc = _queue.top().docID(); - - // merge sort all positions together - PostingsEnum postings; - do { - postings = _queue.top(); - - final int freq = postings.freq(); - for (int i = 0; i < freq; i++) { - _posList.add(postings.nextPosition()); + + /** + * disjunction of postings ordered by docid. + */ + static class DocsQueue extends PriorityQueue { + DocsQueue(int size) { + super(size); } - if (postings.nextDoc() != NO_MORE_DOCS) { - _queue.updateTop(); - } else { - _queue.pop(); - } - } while (_queue.size() > 0 && _queue.top().docID() == _doc); - - _posList.sort(); - _freq = _posList.size(); - - return _doc; - } - - @Override - public int nextPosition() { - return _posList.next(); - } - - @Override - public int startOffset() { - return -1; - } - - @Override - public int endOffset() { - return -1; - } - - @Override - public BytesRef getPayload() { - return null; - } - - @Override - public final int advance(int target) throws IOException { - while (_queue.top() != null && target > _queue.top().docID()) { - PostingsEnum postings = _queue.pop(); - if (postings.advance(target) != NO_MORE_DOCS) { - _queue.add(postings); + @Override + public final boolean lessThan(PostingsEnum a, PostingsEnum b) { + return a.docID() < b.docID(); } } - return nextDoc(); - } + + /** + * queue of terms for a single document. its a sorted array of + * all the positions from all the postings + */ + static class PositionsQueue { + private int arraySize = 16; + private int index = 0; + private int size = 0; + private int[] array = new int[arraySize]; + + void add(int i) { + if (size == arraySize) + growArray(); - @Override - public final int freq() { - return _freq; - } + array[size++] = i; + } - @Override - public final int docID() { - return _doc; - } + int next() { + return array[index++]; + } - @Override - public long cost() { - return cost; + void sort() { + Arrays.sort(array, index, size); + } + + void clear() { + index = 0; + size = 0; + } + + int size() { + return size; + } + + private void growArray() { + int[] newArray = new int[arraySize * 2]; + System.arraycopy(array, 0, newArray, 0, arraySize); + array = newArray; + arraySize *= 2; + } + } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 95bfb28c06c..bc809d2db7b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -174,14 +174,12 @@ public class PhraseQuery extends Query { static class PostingsAndFreq implements Comparable { final PostingsEnum postings; - final int docFreq; final int position; final Term[] terms; final int nTerms; // for faster comparisons - public PostingsAndFreq(PostingsEnum postings, int docFreq, int position, Term... terms) { + public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) { this.postings = postings; - this.docFreq = docFreq; this.position = position; nTerms = terms==null ? 0 : terms.length; if (nTerms>0) { @@ -200,9 +198,6 @@ public class PhraseQuery extends Query { @Override public int compareTo(PostingsAndFreq other) { - if (docFreq != other.docFreq) { - return docFreq - other.docFreq; - } if (position != other.position) { return position - other.position; } @@ -223,7 +218,6 @@ public class PhraseQuery extends Query { public int hashCode() { final int prime = 31; int result = 1; - result = prime * result + docFreq; result = prime * result + position; for (int i=0; i " + td2.totalHits, td1.totalHits <= td2.totalHits); // fill the superset into a bitset BitSet bitset = new BitSet();