mirror of https://github.com/apache/lucene.git
LUCENE-6421: defer reading of positions in MultiPhraseQuery until they are needed
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1673595 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0aa98d464c
commit
a5877d42b0
|
@ -77,6 +77,9 @@ Optimizations
|
|||
* LUCENE-6388: Optimize SpanNearQuery when payloads are not present.
|
||||
(Robert Muir)
|
||||
|
||||
* LUCENE-6421: Defer reading of positions in MultiPhraseQuery until
|
||||
they are needed. (Robert Muir)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
|
||||
|
|
|
@ -200,45 +200,28 @@ public class MultiPhraseQuery extends Query {
|
|||
|
||||
for (int pos=0; pos<postingsFreqs.length; pos++) {
|
||||
Term[] terms = termArrays.get(pos);
|
||||
List<PostingsEnum> postings = new ArrayList<>();
|
||||
|
||||
final PostingsEnum postingsEnum;
|
||||
int docFreq;
|
||||
|
||||
if (terms.length > 1) {
|
||||
postingsEnum = new UnionPostingsEnum(liveDocs, context, terms, termContexts, termsEnum);
|
||||
|
||||
// coarse -- this overcounts since a given doc can
|
||||
// have more than one term:
|
||||
docFreq = 0;
|
||||
for(int termIdx=0;termIdx<terms.length;termIdx++) {
|
||||
final Term term = terms[termIdx];
|
||||
TermState termState = termContexts.get(term).get(context.ord);
|
||||
if (termState == null) {
|
||||
// Term not in reader
|
||||
continue;
|
||||
}
|
||||
termsEnum.seekExact(term.bytes(), termState);
|
||||
docFreq += termsEnum.docFreq();
|
||||
}
|
||||
|
||||
if (docFreq == 0) {
|
||||
// None of the terms are in this reader
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
final Term term = terms[0];
|
||||
for (Term term : terms) {
|
||||
TermState termState = termContexts.get(term).get(context.ord);
|
||||
if (termState == null) {
|
||||
// Term not in reader
|
||||
return null;
|
||||
if (termState != null) {
|
||||
termsEnum.seekExact(term.bytes(), termState);
|
||||
postings.add(termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS));
|
||||
}
|
||||
termsEnum.seekExact(term.bytes(), termState);
|
||||
postingsEnum = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
|
||||
|
||||
docFreq = termsEnum.docFreq();
|
||||
}
|
||||
|
||||
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms);
|
||||
if (postings.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final PostingsEnum postingsEnum;
|
||||
if (postings.size() == 1) {
|
||||
postingsEnum = postings.get(0);
|
||||
} else {
|
||||
postingsEnum = new UnionPostingsEnum(postings);
|
||||
}
|
||||
|
||||
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, positions.get(pos).intValue(), terms);
|
||||
}
|
||||
|
||||
// sort by increasing docFreq order
|
||||
|
@ -398,175 +381,164 @@ public class MultiPhraseQuery extends Query {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes the logical union of multiple DocsEnum iterators.
|
||||
*/
|
||||
/**
|
||||
* Takes the logical union of multiple PostingsEnum iterators.
|
||||
* <p>
|
||||
* Note: positions are merged during freq()
|
||||
*/
|
||||
static class UnionPostingsEnum extends PostingsEnum {
|
||||
/** queue ordered by docid */
|
||||
final DocsQueue docsQueue;
|
||||
/** cost of this enum: sum of its subs */
|
||||
final long cost;
|
||||
|
||||
// TODO: if ever we allow subclassing of the *PhraseScorer
|
||||
class UnionPostingsEnum extends PostingsEnum {
|
||||
/** queue ordered by position for current doc */
|
||||
final PositionsQueue posQueue = new PositionsQueue();
|
||||
/** current doc posQueue is working */
|
||||
int posQueueDoc = -2;
|
||||
/** list of subs (unordered) */
|
||||
final PostingsEnum[] subs;
|
||||
|
||||
private static final class DocsQueue extends PriorityQueue<PostingsEnum> {
|
||||
DocsQueue(List<PostingsEnum> postingsEnums) throws IOException {
|
||||
super(postingsEnums.size());
|
||||
|
||||
Iterator<PostingsEnum> i = postingsEnums.iterator();
|
||||
while (i.hasNext()) {
|
||||
PostingsEnum postings = i.next();
|
||||
if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
add(postings);
|
||||
}
|
||||
UnionPostingsEnum(Collection<PostingsEnum> subs) {
|
||||
docsQueue = new DocsQueue(subs.size());
|
||||
long cost = 0;
|
||||
for (PostingsEnum sub : subs) {
|
||||
docsQueue.add(sub);
|
||||
cost += sub.cost();
|
||||
}
|
||||
this.cost = cost;
|
||||
this.subs = subs.toArray(new PostingsEnum[subs.size()]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean lessThan(PostingsEnum a, PostingsEnum b) {
|
||||
return a.docID() < b.docID();
|
||||
}
|
||||
}
|
||||
|
||||
private static final class IntQueue {
|
||||
private int _arraySize = 16;
|
||||
private int _index = 0;
|
||||
private int _lastIndex = 0;
|
||||
private int[] _array = new int[_arraySize];
|
||||
|
||||
final void add(int i) {
|
||||
if (_lastIndex == _arraySize)
|
||||
growArray();
|
||||
|
||||
_array[_lastIndex++] = i;
|
||||
}
|
||||
|
||||
final int next() {
|
||||
return _array[_index++];
|
||||
}
|
||||
|
||||
final void sort() {
|
||||
Arrays.sort(_array, _index, _lastIndex);
|
||||
}
|
||||
|
||||
final void clear() {
|
||||
_index = 0;
|
||||
_lastIndex = 0;
|
||||
}
|
||||
|
||||
final int size() {
|
||||
return (_lastIndex - _index);
|
||||
}
|
||||
|
||||
private void growArray() {
|
||||
int[] newArray = new int[_arraySize * 2];
|
||||
System.arraycopy(_array, 0, newArray, 0, _arraySize);
|
||||
_array = newArray;
|
||||
_arraySize *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
private int _doc = -1;
|
||||
private int _freq;
|
||||
private DocsQueue _queue;
|
||||
private IntQueue _posList;
|
||||
private long cost;
|
||||
|
||||
public UnionPostingsEnum(Bits liveDocs, LeafReaderContext context, Term[] terms, Map<Term, TermContext> termContexts, TermsEnum termsEnum) throws IOException {
|
||||
List<PostingsEnum> postingsEnums = new LinkedList<>();
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
final Term term = terms[i];
|
||||
TermState termState = termContexts.get(term).get(context.ord);
|
||||
if (termState == null) {
|
||||
// Term doesn't exist in reader
|
||||
continue;
|
||||
public int freq() throws IOException {
|
||||
int doc = docID();
|
||||
if (doc != posQueueDoc) {
|
||||
posQueue.clear();
|
||||
for (PostingsEnum sub : subs) {
|
||||
if (sub.docID() == doc) {
|
||||
int freq = sub.freq();
|
||||
for (int i = 0; i < freq; i++) {
|
||||
posQueue.add(sub.nextPosition());
|
||||
}
|
||||
}
|
||||
}
|
||||
posQueue.sort();
|
||||
posQueueDoc = doc;
|
||||
}
|
||||
termsEnum.seekExact(term.bytes(), termState);
|
||||
PostingsEnum postings = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
|
||||
cost += postings.cost();
|
||||
postingsEnums.add(postings);
|
||||
return posQueue.size();
|
||||
}
|
||||
|
||||
_queue = new DocsQueue(postingsEnums);
|
||||
_posList = new IntQueue();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int nextDoc() throws IOException {
|
||||
if (_queue.size() == 0) {
|
||||
return _doc = NO_MORE_DOCS;
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
return posQueue.next();
|
||||
}
|
||||
|
||||
// TODO: move this init into positions(): if the search
|
||||
// doesn't need the positions for this doc then don't
|
||||
// waste CPU merging them:
|
||||
_posList.clear();
|
||||
_doc = _queue.top().docID();
|
||||
@Override
|
||||
public int docID() {
|
||||
return docsQueue.top().docID();
|
||||
}
|
||||
|
||||
// merge sort all positions together
|
||||
PostingsEnum postings;
|
||||
do {
|
||||
postings = _queue.top();
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
PostingsEnum top = docsQueue.top();
|
||||
int doc = top.docID();
|
||||
|
||||
final int freq = postings.freq();
|
||||
for (int i = 0; i < freq; i++) {
|
||||
_posList.add(postings.nextPosition());
|
||||
do {
|
||||
top.nextDoc();
|
||||
top = docsQueue.updateTop();
|
||||
} while (top.docID() == doc);
|
||||
|
||||
return top.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
PostingsEnum top = docsQueue.top();
|
||||
|
||||
do {
|
||||
top.advance(target);
|
||||
top = docsQueue.updateTop();
|
||||
} while (top.docID() < target);
|
||||
|
||||
return top.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return cost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return -1; // offsets are unsupported
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return -1; // offsets are unsupported
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
return null; // payloads are unsupported
|
||||
}
|
||||
|
||||
/**
|
||||
* disjunction of postings ordered by docid.
|
||||
*/
|
||||
static class DocsQueue extends PriorityQueue<PostingsEnum> {
|
||||
DocsQueue(int size) {
|
||||
super(size);
|
||||
}
|
||||
|
||||
if (postings.nextDoc() != NO_MORE_DOCS) {
|
||||
_queue.updateTop();
|
||||
} else {
|
||||
_queue.pop();
|
||||
}
|
||||
} while (_queue.size() > 0 && _queue.top().docID() == _doc);
|
||||
|
||||
_posList.sort();
|
||||
_freq = _posList.size();
|
||||
|
||||
return _doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextPosition() {
|
||||
return _posList.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int advance(int target) throws IOException {
|
||||
while (_queue.top() != null && target > _queue.top().docID()) {
|
||||
PostingsEnum postings = _queue.pop();
|
||||
if (postings.advance(target) != NO_MORE_DOCS) {
|
||||
_queue.add(postings);
|
||||
@Override
|
||||
public final boolean lessThan(PostingsEnum a, PostingsEnum b) {
|
||||
return a.docID() < b.docID();
|
||||
}
|
||||
}
|
||||
return nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int freq() {
|
||||
return _freq;
|
||||
}
|
||||
/**
|
||||
* queue of terms for a single document. its a sorted array of
|
||||
* all the positions from all the postings
|
||||
*/
|
||||
static class PositionsQueue {
|
||||
private int arraySize = 16;
|
||||
private int index = 0;
|
||||
private int size = 0;
|
||||
private int[] array = new int[arraySize];
|
||||
|
||||
@Override
|
||||
public final int docID() {
|
||||
return _doc;
|
||||
}
|
||||
void add(int i) {
|
||||
if (size == arraySize)
|
||||
growArray();
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return cost;
|
||||
array[size++] = i;
|
||||
}
|
||||
|
||||
int next() {
|
||||
return array[index++];
|
||||
}
|
||||
|
||||
void sort() {
|
||||
Arrays.sort(array, index, size);
|
||||
}
|
||||
|
||||
void clear() {
|
||||
index = 0;
|
||||
size = 0;
|
||||
}
|
||||
|
||||
int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
private void growArray() {
|
||||
int[] newArray = new int[arraySize * 2];
|
||||
System.arraycopy(array, 0, newArray, 0, arraySize);
|
||||
array = newArray;
|
||||
arraySize *= 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -174,14 +174,12 @@ public class PhraseQuery extends Query {
|
|||
|
||||
static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
|
||||
final PostingsEnum postings;
|
||||
final int docFreq;
|
||||
final int position;
|
||||
final Term[] terms;
|
||||
final int nTerms; // for faster comparisons
|
||||
|
||||
public PostingsAndFreq(PostingsEnum postings, int docFreq, int position, Term... terms) {
|
||||
public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) {
|
||||
this.postings = postings;
|
||||
this.docFreq = docFreq;
|
||||
this.position = position;
|
||||
nTerms = terms==null ? 0 : terms.length;
|
||||
if (nTerms>0) {
|
||||
|
@ -200,9 +198,6 @@ public class PhraseQuery extends Query {
|
|||
|
||||
@Override
|
||||
public int compareTo(PostingsAndFreq other) {
|
||||
if (docFreq != other.docFreq) {
|
||||
return docFreq - other.docFreq;
|
||||
}
|
||||
if (position != other.position) {
|
||||
return position - other.position;
|
||||
}
|
||||
|
@ -223,7 +218,6 @@ public class PhraseQuery extends Query {
|
|||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + docFreq;
|
||||
result = prime * result + position;
|
||||
for (int i=0; i<nTerms; i++) {
|
||||
result = prime * result + terms[i].hashCode();
|
||||
|
@ -237,7 +231,6 @@ public class PhraseQuery extends Query {
|
|||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
PostingsAndFreq other = (PostingsAndFreq) obj;
|
||||
if (docFreq != other.docFreq) return false;
|
||||
if (position != other.position) return false;
|
||||
if (terms == null) return other.terms == null;
|
||||
return Arrays.equals(terms, other.terms);
|
||||
|
@ -313,7 +306,7 @@ public class PhraseQuery extends Query {
|
|||
}
|
||||
te.seekExact(t.bytes(), state);
|
||||
PostingsEnum postingsEnum = te.postings(liveDocs, null, PostingsEnum.POSITIONS);
|
||||
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i), t);
|
||||
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions.get(i), t);
|
||||
}
|
||||
|
||||
// sort by increasing docFreq order
|
||||
|
|
|
@ -0,0 +1,116 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/** simple tests for unionpostingsenum */
|
||||
public class TestMultiPhraseEnum extends LuceneTestCase {
|
||||
|
||||
/** Tests union on one document */
|
||||
public void testOneDocument() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig();
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
IndexWriter writer = new IndexWriter(dir, iwc);
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(new TextField("field", "foo bar", Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
|
||||
DirectoryReader ir = DirectoryReader.open(writer, true);
|
||||
writer.close();
|
||||
|
||||
PostingsEnum p1 = getOnlySegmentReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
|
||||
PostingsEnum p2 = getOnlySegmentReader(ir).postings(new Term("field", "bar"), PostingsEnum.POSITIONS);
|
||||
PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
|
||||
|
||||
assertEquals(-1, union.docID());
|
||||
|
||||
assertEquals(0, union.nextDoc());
|
||||
assertEquals(2, union.freq());
|
||||
assertEquals(0, union.nextPosition());
|
||||
assertEquals(1, union.nextPosition());
|
||||
|
||||
assertEquals(DocIdSetIterator.NO_MORE_DOCS, union.nextDoc());
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
/** Tests union on a few documents */
|
||||
public void testSomeDocuments() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig();
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
IndexWriter writer = new IndexWriter(dir, iwc);
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(new TextField("field", "foo", Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
|
||||
writer.addDocument(new Document());
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new TextField("field", "foo bar", Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new TextField("field", "bar", Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
|
||||
DirectoryReader ir = DirectoryReader.open(writer, true);
|
||||
writer.close();
|
||||
|
||||
PostingsEnum p1 = getOnlySegmentReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
|
||||
PostingsEnum p2 = getOnlySegmentReader(ir).postings(new Term("field", "bar"), PostingsEnum.POSITIONS);
|
||||
PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
|
||||
|
||||
assertEquals(-1, union.docID());
|
||||
|
||||
assertEquals(0, union.nextDoc());
|
||||
assertEquals(1, union.freq());
|
||||
assertEquals(0, union.nextPosition());
|
||||
|
||||
assertEquals(2, union.nextDoc());
|
||||
assertEquals(2, union.freq());
|
||||
assertEquals(0, union.nextPosition());
|
||||
assertEquals(1, union.nextPosition());
|
||||
|
||||
assertEquals(3, union.nextDoc());
|
||||
assertEquals(1, union.freq());
|
||||
assertEquals(0, union.nextPosition());
|
||||
|
||||
assertEquals(DocIdSetIterator.NO_MORE_DOCS, union.nextDoc());
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
|
@ -275,7 +275,7 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
|
|||
// not efficient, but simple!
|
||||
TopDocs td1 = s1.search(q1, reader.maxDoc(), sort);
|
||||
TopDocs td2 = s2.search(q2, reader.maxDoc(), sort);
|
||||
assertTrue(td1.totalHits <= td2.totalHits);
|
||||
assertTrue("too many hits: " + td1.totalHits + " > " + td2.totalHits, td1.totalHits <= td2.totalHits);
|
||||
|
||||
// fill the superset into a bitset
|
||||
BitSet bitset = new BitSet();
|
||||
|
|
Loading…
Reference in New Issue