mirror of https://github.com/apache/lucene.git
LUCENE-6198: Two-phase execution for phrase queries and conjunctions.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1659599 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
82eff4eb4d
commit
5b4c02a3a1
|
@ -81,6 +81,11 @@ Optimizations
|
|||
* LUCENE-6233 Speed up CheckIndex when the index has term vectors
|
||||
(Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-6198: Added the TwoPhaseDocIdSetIterator API, exposed on scorers which
|
||||
is for now only used on phrase queries and conjunctions in order to check
|
||||
positions lazily if the phrase query is in a conjunction with other queries.
|
||||
(Robert Muir, Adrien Grand)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-6204, LUCENE-6208: Simplify CompoundFormat: remove files()
|
||||
|
|
|
@ -0,0 +1,202 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.CollectionUtil;
|
||||
|
||||
class ConjunctionDISI extends DocIdSetIterator {
|
||||
|
||||
/** Create a conjunction over the provided iterators, taking advantage of
|
||||
* {@link TwoPhaseDocIdSetIterator}. */
|
||||
public static ConjunctionDISI intersect(List<? extends DocIdSetIterator> iterators) {
|
||||
final List<DocIdSetIterator> allIterators = new ArrayList<>();
|
||||
final List<TwoPhaseDocIdSetIterator> twoPhaseIterators = new ArrayList<>();
|
||||
for (DocIdSetIterator iterator : iterators) {
|
||||
if (iterator instanceof Scorer) {
|
||||
// if we have a scorer, check if it supports two-phase iteration
|
||||
TwoPhaseDocIdSetIterator twoPhaseIterator = ((Scorer) iterator).asTwoPhaseIterator();
|
||||
if (twoPhaseIterator != null) {
|
||||
// Note:
|
||||
allIterators.add(twoPhaseIterator.approximation());
|
||||
twoPhaseIterators.add(twoPhaseIterator);
|
||||
} else {
|
||||
allIterators.add(iterator);
|
||||
}
|
||||
} else {
|
||||
// no approximation support, use the iterator as-is
|
||||
allIterators.add(iterator);
|
||||
}
|
||||
}
|
||||
|
||||
if (twoPhaseIterators.isEmpty()) {
|
||||
return new ConjunctionDISI(allIterators);
|
||||
} else {
|
||||
return new TwoPhase(allIterators, twoPhaseIterators);
|
||||
}
|
||||
}
|
||||
|
||||
final DocIdSetIterator lead;
|
||||
final DocIdSetIterator[] others;
|
||||
|
||||
ConjunctionDISI(List<? extends DocIdSetIterator> iterators) {
|
||||
// Sort the array the first time to allow the least frequent DocsEnum to
|
||||
// lead the matching.
|
||||
CollectionUtil.timSort(iterators, new Comparator<DocIdSetIterator>() {
|
||||
@Override
|
||||
public int compare(DocIdSetIterator o1, DocIdSetIterator o2) {
|
||||
return Long.compare(o1.cost(), o2.cost());
|
||||
}
|
||||
});
|
||||
lead = iterators.get(0);
|
||||
others = iterators.subList(1, iterators.size()).toArray(new DocIdSetIterator[0]);
|
||||
}
|
||||
|
||||
protected boolean matches() throws IOException {
|
||||
return true;
|
||||
}
|
||||
|
||||
TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||
return null;
|
||||
}
|
||||
|
||||
private int doNext(int doc) throws IOException {
|
||||
for(;;) {
|
||||
|
||||
if (doc == NO_MORE_DOCS) {
|
||||
// we need this check because it is only ok to call #matches when positioned
|
||||
return NO_MORE_DOCS;
|
||||
}
|
||||
|
||||
advanceHead: for(;;) {
|
||||
for (DocIdSetIterator other : others) {
|
||||
// invariant: docsAndFreqs[i].doc <= doc at this point.
|
||||
|
||||
// docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead"
|
||||
// on the previous iteration and the advance on the lead scorer exactly matched.
|
||||
if (other.docID() < doc) {
|
||||
final int next = other.advance(doc);
|
||||
|
||||
if (next > doc) {
|
||||
// DocsEnum beyond the current doc - break and advance lead to the new highest doc.
|
||||
doc = lead.advance(next);
|
||||
break advanceHead;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (matches()) {
|
||||
// success - all DocsEnums are on the same doc
|
||||
return doc;
|
||||
} else {
|
||||
doc = lead.nextDoc();
|
||||
break advanceHead;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return doNext(lead.advance(target));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return lead.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return doNext(lead.nextDoc());
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return lead.cost();
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link TwoPhaseDocIdSetIterator} view of a {@link TwoPhase} conjunction.
|
||||
*/
|
||||
private static class TwoPhaseConjunctionDISI extends TwoPhaseDocIdSetIterator {
|
||||
|
||||
private final ConjunctionDISI approximation;
|
||||
private final TwoPhaseDocIdSetIterator[] twoPhaseIterators;
|
||||
|
||||
private TwoPhaseConjunctionDISI(List<? extends DocIdSetIterator> iterators, List<TwoPhaseDocIdSetIterator> twoPhaseIterators) {
|
||||
approximation = new ConjunctionDISI(iterators);
|
||||
assert twoPhaseIterators.size() > 0;
|
||||
this.twoPhaseIterators = twoPhaseIterators.toArray(new TwoPhaseDocIdSetIterator[0]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSetIterator approximation() {
|
||||
return approximation;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches() throws IOException {
|
||||
for (TwoPhaseDocIdSetIterator twoPhaseIterator : twoPhaseIterators) {
|
||||
if (twoPhaseIterator.matches() == false) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* A conjunction DISI built on top of approximations. This implementation
|
||||
* verifies that documents actually match by consulting the provided
|
||||
* {@link TwoPhaseDocIdSetIterator}s.
|
||||
*
|
||||
* Another important difference with {@link ConjunctionDISI} is that this
|
||||
* implementation supports approximations too: the approximation of this
|
||||
* impl is the conjunction of the approximations of the wrapped iterators.
|
||||
* This allows eg. {@code +"A B" +C} to be approximated as
|
||||
* {@code +(+A +B) +C}.
|
||||
*/
|
||||
// NOTE: this is essentially the same as TwoPhaseDocIdSetIterator.asDocIdSetIterator
|
||||
// but is its own impl in order to be able to expose a two-phase view
|
||||
private static class TwoPhase extends ConjunctionDISI {
|
||||
|
||||
final TwoPhaseConjunctionDISI twoPhaseView;
|
||||
|
||||
private TwoPhase(List<? extends DocIdSetIterator> iterators, List<TwoPhaseDocIdSetIterator> twoPhaseIterators) {
|
||||
super(iterators);
|
||||
twoPhaseView = new TwoPhaseConjunctionDISI(iterators, twoPhaseIterators);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TwoPhaseConjunctionDISI asTwoPhaseIterator() {
|
||||
return twoPhaseView;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean matches() throws IOException {
|
||||
return twoPhaseView.matches();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -20,18 +20,14 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/** Scorer for conjunctions, sets of queries, all of which are required. */
|
||||
class ConjunctionScorer extends Scorer {
|
||||
|
||||
protected int lastDoc = -1;
|
||||
protected final DocsAndFreqs[] docsAndFreqs;
|
||||
private final DocsAndFreqs lead;
|
||||
private final ConjunctionDISI disi;
|
||||
private final Scorer[] scorers;
|
||||
private final float coord;
|
||||
|
||||
|
@ -44,68 +40,28 @@ class ConjunctionScorer extends Scorer {
|
|||
super(weight);
|
||||
assert required.containsAll(scorers);
|
||||
this.coord = coord;
|
||||
this.docsAndFreqs = new DocsAndFreqs[required.size()];
|
||||
for (int i = 0; i < required.size(); ++i) {
|
||||
docsAndFreqs[i] = new DocsAndFreqs(required.get(i));
|
||||
}
|
||||
// Sort the array the first time to allow the least frequent DocsEnum to
|
||||
// lead the matching.
|
||||
ArrayUtil.timSort(docsAndFreqs, new Comparator<DocsAndFreqs>() {
|
||||
@Override
|
||||
public int compare(DocsAndFreqs o1, DocsAndFreqs o2) {
|
||||
return Long.compare(o1.cost, o2.cost);
|
||||
}
|
||||
});
|
||||
|
||||
lead = docsAndFreqs[0]; // least frequent DocsEnum leads the intersection
|
||||
|
||||
this.disi = ConjunctionDISI.intersect(required);
|
||||
this.scorers = scorers.toArray(new Scorer[scorers.size()]);
|
||||
}
|
||||
|
||||
private int doNext(int doc) throws IOException {
|
||||
for(;;) {
|
||||
// doc may already be NO_MORE_DOCS here, but we don't check explicitly
|
||||
// since all scorers should advance to NO_MORE_DOCS, match, then
|
||||
// return that value.
|
||||
advanceHead: for(;;) {
|
||||
for (int i = 1; i < docsAndFreqs.length; i++) {
|
||||
// invariant: docsAndFreqs[i].doc <= doc at this point.
|
||||
|
||||
// docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead"
|
||||
// on the previous iteration and the advance on the lead scorer exactly matched.
|
||||
if (docsAndFreqs[i].doc < doc) {
|
||||
docsAndFreqs[i].doc = docsAndFreqs[i].iterator.advance(doc);
|
||||
|
||||
if (docsAndFreqs[i].doc > doc) {
|
||||
// DocsEnum beyond the current doc - break and advance lead to the new highest doc.
|
||||
doc = docsAndFreqs[i].doc;
|
||||
break advanceHead;
|
||||
}
|
||||
}
|
||||
}
|
||||
// success - all DocsEnums are on the same doc
|
||||
return doc;
|
||||
}
|
||||
// advance head for next iteration
|
||||
doc = lead.doc = lead.iterator.advance(doc);
|
||||
}
|
||||
@Override
|
||||
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||
return disi.asTwoPhaseIterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
lead.doc = lead.iterator.advance(target);
|
||||
return lastDoc = doNext(lead.doc);
|
||||
return disi.advance(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return lastDoc;
|
||||
return disi.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
lead.doc = lead.iterator.nextDoc();
|
||||
return lastDoc = doNext(lead.doc);
|
||||
return disi.nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -120,7 +76,7 @@ class ConjunctionScorer extends Scorer {
|
|||
|
||||
@Override
|
||||
public int freq() {
|
||||
return docsAndFreqs.length;
|
||||
return scorers.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -145,12 +101,12 @@ class ConjunctionScorer extends Scorer {
|
|||
|
||||
@Override
|
||||
public long cost() {
|
||||
return lead.iterator.cost();
|
||||
return disi.cost();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<ChildScorer> getChildren() {
|
||||
ArrayList<ChildScorer> children = new ArrayList<>(docsAndFreqs.length);
|
||||
ArrayList<ChildScorer> children = new ArrayList<>();
|
||||
for (Scorer scorer : scorers) {
|
||||
children.add(new ChildScorer(scorer, "MUST"));
|
||||
}
|
||||
|
|
|
@ -18,9 +18,11 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
|
@ -49,10 +51,11 @@ final class ExactPhraseScorer extends Scorer {
|
|||
}
|
||||
}
|
||||
|
||||
private final ConjunctionDISI conjunction;
|
||||
|
||||
private final ChunkState[] chunkStates;
|
||||
private final PostingsEnum lead;
|
||||
|
||||
private int docID = -1;
|
||||
private int freq;
|
||||
|
||||
private final Similarity.SimScorer docScorer;
|
||||
|
@ -72,49 +75,46 @@ final class ExactPhraseScorer extends Scorer {
|
|||
// min(cost)
|
||||
cost = lead.cost();
|
||||
|
||||
List<DocIdSetIterator> iterators = new ArrayList<>();
|
||||
for(int i=0;i<postings.length;i++) {
|
||||
chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position);
|
||||
iterators.add(postings[i].postings);
|
||||
}
|
||||
conjunction = ConjunctionDISI.intersect(iterators);
|
||||
}
|
||||
|
||||
private int doNext(int doc) throws IOException {
|
||||
for(;;) {
|
||||
// TODO: don't dup this logic from conjunctionscorer :)
|
||||
advanceHead: for(;;) {
|
||||
for (int i = 1; i < chunkStates.length; i++) {
|
||||
final PostingsEnum de = chunkStates[i].posEnum;
|
||||
if (de.docID() < doc) {
|
||||
int d = de.advance(doc);
|
||||
|
||||
if (d > doc) {
|
||||
// DocsEnum beyond the current doc - break and advance lead to the new highest doc.
|
||||
doc = d;
|
||||
break advanceHead;
|
||||
}
|
||||
}
|
||||
}
|
||||
// all DocsEnums are on the same doc
|
||||
if (doc == NO_MORE_DOCS) {
|
||||
return doc;
|
||||
} else if (phraseFreq() > 0) {
|
||||
return doc; // success: matches phrase
|
||||
} else {
|
||||
doc = lead.nextDoc(); // doesn't match phrase
|
||||
}
|
||||
@Override
|
||||
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||
return new TwoPhaseDocIdSetIterator() {
|
||||
|
||||
@Override
|
||||
public boolean matches() throws IOException {
|
||||
return phraseFreq() > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSetIterator approximation() {
|
||||
return conjunction;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private int doNext(int doc) throws IOException {
|
||||
for (;; doc = conjunction.nextDoc()) {
|
||||
if (doc == NO_MORE_DOCS || phraseFreq() > 0) {
|
||||
return doc;
|
||||
}
|
||||
// advance head for next iteration
|
||||
doc = lead.advance(doc);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return docID = doNext(lead.nextDoc());
|
||||
return doNext(conjunction.nextDoc());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return docID = doNext(lead.advance(target));
|
||||
return doNext(conjunction.advance(target));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -149,12 +149,12 @@ final class ExactPhraseScorer extends Scorer {
|
|||
|
||||
@Override
|
||||
public int docID() {
|
||||
return docID;
|
||||
return conjunction.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score() {
|
||||
return docScorer.score(docID, freq);
|
||||
return docScorer.score(docID(), freq);
|
||||
}
|
||||
|
||||
private int phraseFreq() throws IOException {
|
||||
|
|
|
@ -60,7 +60,7 @@ public abstract class Scorer extends PostingsEnum {
|
|||
* {@link LeafCollector#collect}.
|
||||
*/
|
||||
public abstract float score() throws IOException;
|
||||
|
||||
|
||||
/** returns parent Weight
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -99,4 +99,23 @@ public abstract class Scorer extends PostingsEnum {
|
|||
this.relationship = relationship;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Optional method: Return a {@link TwoPhaseDocIdSetIterator} view of this
|
||||
* {@link Scorer}. A return value of {@code null} indicates that
|
||||
* two-phase iteration is not supported.
|
||||
*
|
||||
* Note that the returned {@link TwoPhaseDocIdSetIterator}'s
|
||||
* {@link TwoPhaseDocIdSetIterator#approximation() approximation} must
|
||||
* advance synchronously with this iterator: advancing the approximation must
|
||||
* advance this iterator and vice-versa.
|
||||
*
|
||||
* Implementing this method is typically useful on {@link Scorer}s
|
||||
* that have a high per-document overhead in order to confirm matches.
|
||||
*
|
||||
* The default implementation returns {@code null}.
|
||||
*/
|
||||
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* An approximation of a {@link DocIdSetIterator}. When the {@link #approximation()}'s
|
||||
* {@link DocIdSetIterator#nextDoc()} or {@link DocIdSetIterator#advance(int)}
|
||||
* return, {@link #matches()} needs to be checked in order to know whether the
|
||||
* returned doc ID actually matches.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class TwoPhaseDocIdSetIterator {
|
||||
|
||||
/** Return a {@link DocIdSetIterator} view of the provided
|
||||
* {@link TwoPhaseDocIdSetIterator}. */
|
||||
public static DocIdSetIterator asDocIdSetIterator(TwoPhaseDocIdSetIterator twoPhaseIterator) {
|
||||
final DocIdSetIterator approximation = twoPhaseIterator.approximation();
|
||||
return new DocIdSetIterator() {
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return approximation.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return doNext(approximation.nextDoc());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return doNext(approximation.advance(target));
|
||||
}
|
||||
|
||||
private int doNext(int doc) throws IOException {
|
||||
for (;; doc = approximation.nextDoc()) {
|
||||
if (doc == NO_MORE_DOCS) {
|
||||
return NO_MORE_DOCS;
|
||||
} else if (twoPhaseIterator.matches()) {
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return approximation.cost();
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
/** Return an approximation. The returned {@link DocIdSetIterator} is a
|
||||
* superset of the matching documents, and each match needs to be confirmed
|
||||
* with {@link #matches()} in order to know whether it matches or not. */
|
||||
public abstract DocIdSetIterator approximation();
|
||||
|
||||
/** Return whether the current doc ID that the iterator is on matches. This
|
||||
* method should only be called when the iterator is positionned, ie. not
|
||||
* when {@link DocIdSetIterator#docID()} is {@code -1} or
|
||||
* {@link DocIdSetIterator#NO_MORE_DOCS}. */
|
||||
public abstract boolean matches() throws IOException;
|
||||
|
||||
}
|
|
@ -590,4 +590,33 @@ public class TestBooleanQuery extends LuceneTestCase {
|
|||
w.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testConjunctionSupportsApproximations() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
Field f = newTextField("field", "a b c", Field.Store.NO);
|
||||
doc.add(f);
|
||||
w.addDocument(doc);
|
||||
w.commit();
|
||||
|
||||
DirectoryReader reader = w.getReader();
|
||||
final IndexSearcher searcher = new IndexSearcher(reader);
|
||||
|
||||
PhraseQuery pq = new PhraseQuery();
|
||||
pq.add(new Term("field", "a"));
|
||||
pq.add(new Term("field", "b"));
|
||||
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
q.add(pq, Occur.MUST);
|
||||
q.add(new TermQuery(new Term("field", "c")), Occur.FILTER);
|
||||
|
||||
final Weight weight = searcher.createNormalizedWeight(q, random().nextBoolean());
|
||||
final Scorer scorer = weight.scorer(reader.leaves().get(0), null);
|
||||
assertNotNull(scorer.asTwoPhaseIterator());
|
||||
|
||||
reader.close();
|
||||
w.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,273 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.util.BitDocIdSet;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestConjunctionDISI extends LuceneTestCase {
|
||||
|
||||
private static TwoPhaseDocIdSetIterator approximation(final DocIdSetIterator iterator, final FixedBitSet confirmed) {
|
||||
return new TwoPhaseDocIdSetIterator() {
|
||||
|
||||
@Override
|
||||
public DocIdSetIterator approximation() {
|
||||
return iterator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches() throws IOException {
|
||||
return confirmed.get(iterator.docID());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private static Scorer scorer(TwoPhaseDocIdSetIterator twoPhaseIterator) {
|
||||
return scorer(TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator), twoPhaseIterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a {@link Scorer} that wraps the given {@link DocIdSetIterator}. It
|
||||
* also accepts a {@link TwoPhaseDocIdSetIterator} view, which is exposed in
|
||||
* {@link Scorer#asTwoPhaseIterator()}. When the two-phase view is not null,
|
||||
* then {@link Scorer#nextDoc()} and {@link Scorer#advance(int)} will raise
|
||||
* an exception in order to make sure that {@link ConjunctionDISI} takes
|
||||
* advantage of the {@link TwoPhaseDocIdSetIterator} view.
|
||||
*/
|
||||
private static Scorer scorer(DocIdSetIterator it, TwoPhaseDocIdSetIterator twoPhaseIterator) {
|
||||
return new Scorer(null) {
|
||||
|
||||
@Override
|
||||
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
|
||||
return twoPhaseIterator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
if (twoPhaseIterator != null) {
|
||||
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
|
||||
}
|
||||
return it.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
if (twoPhaseIterator != null) {
|
||||
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
|
||||
}
|
||||
return it.nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
if (twoPhaseIterator != null) {
|
||||
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
|
||||
}
|
||||
return it.advance(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
if (twoPhaseIterator != null) {
|
||||
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
|
||||
}
|
||||
return it.cost();
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score() throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
private static FixedBitSet randomSet(int maxDoc) {
|
||||
final int step = TestUtil.nextInt(random(), 1, 10);
|
||||
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
for (int doc = random().nextInt(step); doc < maxDoc; doc += TestUtil.nextInt(random(), 1, step)) {
|
||||
set.set(doc);
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
private static FixedBitSet clearRandomBits(FixedBitSet other) {
|
||||
final FixedBitSet set = new FixedBitSet(other.length());
|
||||
set.or(other);
|
||||
for (int i = 0; i < set.length(); ++i) {
|
||||
if (random().nextBoolean()) {
|
||||
set.clear(i);
|
||||
}
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
private static FixedBitSet intersect(FixedBitSet[] bitSets) {
|
||||
final FixedBitSet intersection = new FixedBitSet(bitSets[0].length());
|
||||
intersection.or(bitSets[0]);
|
||||
for (int i = 1; i < bitSets.length; ++i) {
|
||||
intersection.and(bitSets[i]);
|
||||
}
|
||||
return intersection;
|
||||
}
|
||||
|
||||
private static FixedBitSet toBitSet(int maxDoc, DocIdSetIterator iterator) throws IOException {
|
||||
final FixedBitSet set = new FixedBitSet(maxDoc);
|
||||
for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) {
|
||||
set.set(doc);
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
// Test that the conjunction iterator is correct
|
||||
public void testConjunction() throws IOException {
|
||||
final int iters = atLeast(100);
|
||||
for (int iter = 0; iter < iters; ++iter) {
|
||||
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
|
||||
final int numIterators = TestUtil.nextInt(random(), 2, 5);
|
||||
final FixedBitSet[] sets = new FixedBitSet[numIterators];
|
||||
final DocIdSetIterator[] iterators = new DocIdSetIterator[numIterators];
|
||||
for (int i = 0; i < iterators.length; ++i) {
|
||||
final FixedBitSet set = randomSet(maxDoc);
|
||||
if (random().nextBoolean()) {
|
||||
// simple iterator
|
||||
sets[i] = set;
|
||||
iterators[i] = new BitDocIdSet(set).iterator();
|
||||
} else {
|
||||
// scorer with approximation
|
||||
final FixedBitSet confirmed = clearRandomBits(set);
|
||||
sets[i] = confirmed;
|
||||
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
|
||||
iterators[i] = scorer(approximation);
|
||||
}
|
||||
}
|
||||
|
||||
final ConjunctionDISI conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators));
|
||||
assertEquals(intersect(sets), toBitSet(maxDoc, conjunction));
|
||||
}
|
||||
}
|
||||
|
||||
// Test that the conjunction approximation is correct
|
||||
public void testConjunctionApproximation() throws IOException {
|
||||
final int iters = atLeast(100);
|
||||
for (int iter = 0; iter < iters; ++iter) {
|
||||
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
|
||||
final int numIterators = TestUtil.nextInt(random(), 2, 5);
|
||||
final FixedBitSet[] sets = new FixedBitSet[numIterators];
|
||||
final DocIdSetIterator[] iterators = new DocIdSetIterator[numIterators];
|
||||
boolean hasApproximation = false;
|
||||
for (int i = 0; i < iterators.length; ++i) {
|
||||
final FixedBitSet set = randomSet(maxDoc);
|
||||
if (random().nextBoolean()) {
|
||||
// simple iterator
|
||||
sets[i] = set;
|
||||
iterators[i] = new BitDocIdSet(set).iterator();
|
||||
} else {
|
||||
// scorer with approximation
|
||||
final FixedBitSet confirmed = clearRandomBits(set);
|
||||
sets[i] = confirmed;
|
||||
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
|
||||
iterators[i] = scorer(approximation);
|
||||
hasApproximation = true;
|
||||
}
|
||||
}
|
||||
|
||||
final ConjunctionDISI conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators));
|
||||
TwoPhaseDocIdSetIterator twoPhaseIterator = conjunction.asTwoPhaseIterator();
|
||||
assertEquals(hasApproximation, twoPhaseIterator != null);
|
||||
if (hasApproximation) {
|
||||
assertEquals(intersect(sets), toBitSet(maxDoc, TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This test makes sure that when nesting scorers with ConjunctionDISI, confirmations are pushed to the root.
|
||||
public void testRecursiveConjunctionApproximation() throws IOException {
|
||||
final int iters = atLeast(100);
|
||||
for (int iter = 0; iter < iters; ++iter) {
|
||||
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
|
||||
final int numIterators = TestUtil.nextInt(random(), 2, 5);
|
||||
final FixedBitSet[] sets = new FixedBitSet[numIterators];
|
||||
DocIdSetIterator conjunction = null;
|
||||
boolean hasApproximation = false;
|
||||
for (int i = 0; i < numIterators; ++i) {
|
||||
final FixedBitSet set = randomSet(maxDoc);
|
||||
final DocIdSetIterator newIterator;
|
||||
if (random().nextBoolean()) {
|
||||
// simple iterator
|
||||
sets[i] = set;
|
||||
newIterator = new BitDocIdSet(set).iterator();
|
||||
} else {
|
||||
// scorer with approximation
|
||||
final FixedBitSet confirmed = clearRandomBits(set);
|
||||
sets[i] = confirmed;
|
||||
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
|
||||
newIterator = scorer(approximation);
|
||||
hasApproximation = true;
|
||||
}
|
||||
|
||||
if (conjunction == null) {
|
||||
conjunction = newIterator;
|
||||
} else {
|
||||
final ConjunctionDISI conj = ConjunctionDISI.intersect(Arrays.asList(conjunction, newIterator));
|
||||
conjunction = scorer(conj, conj.asTwoPhaseIterator());
|
||||
}
|
||||
}
|
||||
|
||||
TwoPhaseDocIdSetIterator twoPhaseIterator = ((Scorer) conjunction).asTwoPhaseIterator();
|
||||
assertEquals(hasApproximation, twoPhaseIterator != null);
|
||||
if (hasApproximation) {
|
||||
assertEquals(intersect(sets), toBitSet(maxDoc, TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator)));
|
||||
} else {
|
||||
assertEquals(intersect(sets), toBitSet(maxDoc, conjunction));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue