LUCENE-6198: Two-phase execution for phrase queries and conjunctions.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1659599 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2015-02-13 16:45:06 +00:00
parent 82eff4eb4d
commit 5b4c02a3a1
8 changed files with 653 additions and 88 deletions

View File: CHANGES.txt

@@ -81,6 +81,11 @@ Optimizations
 * LUCENE-6233: Speed up CheckIndex when the index has term vectors
   (Robert Muir, Mike McCandless)
 
+* LUCENE-6198: Added the TwoPhaseDocIdSetIterator API, exposed on scorers,
+  which is for now only used by phrase queries and conjunctions in order to
+  check positions lazily when the phrase query is in a conjunction with other
+  queries. (Robert Muir, Adrien Grand)
+
 API Changes
 
 * LUCENE-6204, LUCENE-6208: Simplify CompoundFormat: remove files()
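To make the new API concrete, here is a consumer-side sketch (a minimal illustration, not the actual collector code from this commit; the class and method names are made up): the caller asks a scorer for its optional two-phase view, drives the cheap approximation, and only confirms candidates with matches() before collecting them.

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseDocIdSetIterator;

class TwoPhaseCollectExample {
  // Collect all hits of a scorer, using the two-phase view when one is available.
  static void collectAll(Scorer scorer, LeafCollector collector) throws IOException {
    collector.setScorer(scorer);
    final TwoPhaseDocIdSetIterator twoPhase = scorer.asTwoPhaseIterator();
    if (twoPhase == null) {
      // No approximation support: iterate the scorer directly.
      for (int doc = scorer.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = scorer.nextDoc()) {
        collector.collect(doc);
      }
    } else {
      // The approximation is a superset of the matches; the expensive check
      // (positions, in the phrase case) only runs when matches() is called.
      final DocIdSetIterator approximation = twoPhase.approximation();
      for (int doc = approximation.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = approximation.nextDoc()) {
        if (twoPhase.matches()) {
          collector.collect(doc);
        }
      }
    }
  }
}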

View File: org/apache/lucene/search/ConjunctionDISI.java (new file)

@@ -0,0 +1,202 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.CollectionUtil;
class ConjunctionDISI extends DocIdSetIterator {
/** Create a conjunction over the provided iterators, taking advantage of
* {@link TwoPhaseDocIdSetIterator}. */
public static ConjunctionDISI intersect(List<? extends DocIdSetIterator> iterators) {
final List<DocIdSetIterator> allIterators = new ArrayList<>();
final List<TwoPhaseDocIdSetIterator> twoPhaseIterators = new ArrayList<>();
for (DocIdSetIterator iterator : iterators) {
if (iterator instanceof Scorer) {
// if we have a scorer, check if it supports two-phase iteration
TwoPhaseDocIdSetIterator twoPhaseIterator = ((Scorer) iterator).asTwoPhaseIterator();
if (twoPhaseIterator != null) {
// two-phase iteration is supported: drive the cheap approximation here and
// check matches() lazily through the two-phase view
allIterators.add(twoPhaseIterator.approximation());
twoPhaseIterators.add(twoPhaseIterator);
} else {
allIterators.add(iterator);
}
} else {
// no approximation support, use the iterator as-is
allIterators.add(iterator);
}
}
if (twoPhaseIterators.isEmpty()) {
return new ConjunctionDISI(allIterators);
} else {
return new TwoPhase(allIterators, twoPhaseIterators);
}
}
final DocIdSetIterator lead;
final DocIdSetIterator[] others;
ConjunctionDISI(List<? extends DocIdSetIterator> iterators) {
// Sort the array the first time to allow the least frequent DocsEnum to
// lead the matching.
CollectionUtil.timSort(iterators, new Comparator<DocIdSetIterator>() {
@Override
public int compare(DocIdSetIterator o1, DocIdSetIterator o2) {
return Long.compare(o1.cost(), o2.cost());
}
});
lead = iterators.get(0);
others = iterators.subList(1, iterators.size()).toArray(new DocIdSetIterator[0]);
}
protected boolean matches() throws IOException {
return true;
}
TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
return null;
}
private int doNext(int doc) throws IOException {
for(;;) {
if (doc == NO_MORE_DOCS) {
// we need this check because it is only ok to call #matches when positioned
return NO_MORE_DOCS;
}
advanceHead: for(;;) {
for (DocIdSetIterator other : others) {
// invariant: other.docID() <= doc at this point.
// other.docID() may already be equal to doc if we "broke advanceHead"
// on the previous iteration and the advance on the lead iterator exactly matched.
if (other.docID() < doc) {
final int next = other.advance(doc);
if (next > doc) {
// DocsEnum beyond the current doc - break and advance lead to the new highest doc.
doc = lead.advance(next);
break advanceHead;
}
}
}
if (matches()) {
// success - all DocsEnums are on the same doc
return doc;
} else {
doc = lead.nextDoc();
break advanceHead;
}
}
}
}
@Override
public int advance(int target) throws IOException {
return doNext(lead.advance(target));
}
@Override
public int docID() {
return lead.docID();
}
@Override
public int nextDoc() throws IOException {
return doNext(lead.nextDoc());
}
@Override
public long cost() {
return lead.cost();
}
/**
* {@link TwoPhaseDocIdSetIterator} view of a {@link TwoPhase} conjunction.
*/
private static class TwoPhaseConjunctionDISI extends TwoPhaseDocIdSetIterator {
private final ConjunctionDISI approximation;
private final TwoPhaseDocIdSetIterator[] twoPhaseIterators;
private TwoPhaseConjunctionDISI(List<? extends DocIdSetIterator> iterators, List<TwoPhaseDocIdSetIterator> twoPhaseIterators) {
approximation = new ConjunctionDISI(iterators);
assert twoPhaseIterators.size() > 0;
this.twoPhaseIterators = twoPhaseIterators.toArray(new TwoPhaseDocIdSetIterator[0]);
}
@Override
public DocIdSetIterator approximation() {
return approximation;
}
@Override
public boolean matches() throws IOException {
for (TwoPhaseDocIdSetIterator twoPhaseIterator : twoPhaseIterators) {
if (twoPhaseIterator.matches() == false) {
return false;
}
}
return true;
}
}
/**
* A conjunction DISI built on top of approximations. This implementation
* verifies that documents actually match by consulting the provided
* {@link TwoPhaseDocIdSetIterator}s.
*
* Another important difference with {@link ConjunctionDISI} is that this
* implementation supports approximations too: the approximation of this
* impl is the conjunction of the approximations of the wrapped iterators.
* This allows e.g. {@code +"A B" +C} to be approximated as
* {@code +(+A +B) +C}.
*/
// NOTE: this is essentially the same as TwoPhaseDocIdSetIterator.asDocIdSetIterator
// but is its own impl in order to be able to expose a two-phase view
private static class TwoPhase extends ConjunctionDISI {
final TwoPhaseConjunctionDISI twoPhaseView;
private TwoPhase(List<? extends DocIdSetIterator> iterators, List<TwoPhaseDocIdSetIterator> twoPhaseIterators) {
super(iterators);
twoPhaseView = new TwoPhaseConjunctionDISI(iterators, twoPhaseIterators);
}
@Override
public TwoPhaseConjunctionDISI asTwoPhaseIterator() {
return twoPhaseView;
}
@Override
protected boolean matches() throws IOException {
return twoPhaseView.matches();
}
}
}
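A short usage sketch for this helper (hedged: docs1, docs2 and phraseScorer are hypothetical iterators/scorers, and since ConjunctionDISI is package-private this only compiles inside org.apache.lucene.search). intersect() sorts the clauses by cost() so the cheapest one leads the leapfrog, and the conjunction only exposes a two-phase view when at least one clause did:

// Plain clauses: nothing exposes an approximation, so the conjunction behaves
// like a regular leapfrog intersection and has no two-phase view.
ConjunctionDISI plain = ConjunctionDISI.intersect(Arrays.asList(docs1, docs2));
assert plain.asTwoPhaseIterator() == null;
for (int doc = plain.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = plain.nextDoc()) {
  // doc is in both docs1 and docs2
}

// Mixed clauses: phraseScorer supports two-phase iteration, so the conjunction
// is driven by docs1 intersected with the phrase's own approximation, and
// positions are only checked through matches().
ConjunctionDISI mixed = ConjunctionDISI.intersect(Arrays.asList(docs1, phraseScorer));
TwoPhaseDocIdSetIterator twoPhase = mixed.asTwoPhaseIterator();
DocIdSetIterator approximation = twoPhase.approximation();
for (int doc = approximation.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = approximation.nextDoc()) {
  if (twoPhase.matches()) {
    // doc is a confirmed match of the whole conjunction
  }
}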

View File: org/apache/lucene/search/ConjunctionScorer.java

@@ -20,18 +20,14 @@ package org.apache.lucene.search;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.Comparator;
 import java.util.List;
-import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 /** Scorer for conjunctions, sets of queries, all of which are required. */
 class ConjunctionScorer extends Scorer {
-  protected int lastDoc = -1;
-  protected final DocsAndFreqs[] docsAndFreqs;
-  private final DocsAndFreqs lead;
+  private final ConjunctionDISI disi;
   private final Scorer[] scorers;
   private final float coord;
@@ -44,68 +40,28 @@ class ConjunctionScorer extends Scorer {
     super(weight);
     assert required.containsAll(scorers);
     this.coord = coord;
-    this.docsAndFreqs = new DocsAndFreqs[required.size()];
-    for (int i = 0; i < required.size(); ++i) {
-      docsAndFreqs[i] = new DocsAndFreqs(required.get(i));
-    }
-    // Sort the array the first time to allow the least frequent DocsEnum to
-    // lead the matching.
-    ArrayUtil.timSort(docsAndFreqs, new Comparator<DocsAndFreqs>() {
-      @Override
-      public int compare(DocsAndFreqs o1, DocsAndFreqs o2) {
-        return Long.compare(o1.cost, o2.cost);
-      }
-    });
-    lead = docsAndFreqs[0]; // least frequent DocsEnum leads the intersection
+    this.disi = ConjunctionDISI.intersect(required);
     this.scorers = scorers.toArray(new Scorer[scorers.size()]);
   }
-  private int doNext(int doc) throws IOException {
-    for(;;) {
-      // doc may already be NO_MORE_DOCS here, but we don't check explicitly
-      // since all scorers should advance to NO_MORE_DOCS, match, then
-      // return that value.
-      advanceHead: for(;;) {
-        for (int i = 1; i < docsAndFreqs.length; i++) {
-          // invariant: docsAndFreqs[i].doc <= doc at this point.
-          // docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead"
-          // on the previous iteration and the advance on the lead scorer exactly matched.
-          if (docsAndFreqs[i].doc < doc) {
-            docsAndFreqs[i].doc = docsAndFreqs[i].iterator.advance(doc);
-            if (docsAndFreqs[i].doc > doc) {
-              // DocsEnum beyond the current doc - break and advance lead to the new highest doc.
-              doc = docsAndFreqs[i].doc;
-              break advanceHead;
-            }
-          }
-        }
-        // success - all DocsEnums are on the same doc
-        return doc;
-      }
-      // advance head for next iteration
-      doc = lead.doc = lead.iterator.advance(doc);
-    }
-  }
+  @Override
+  public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
+    return disi.asTwoPhaseIterator();
   }
   @Override
   public int advance(int target) throws IOException {
-    lead.doc = lead.iterator.advance(target);
-    return lastDoc = doNext(lead.doc);
+    return disi.advance(target);
   }
   @Override
   public int docID() {
-    return lastDoc;
+    return disi.docID();
   }
   @Override
   public int nextDoc() throws IOException {
-    lead.doc = lead.iterator.nextDoc();
-    return lastDoc = doNext(lead.doc);
+    return disi.nextDoc();
   }
   @Override
@@ -120,7 +76,7 @@ class ConjunctionScorer extends Scorer {
   @Override
   public int freq() {
-    return docsAndFreqs.length;
+    return scorers.length;
   }
   @Override
@@ -145,12 +101,12 @@ class ConjunctionScorer extends Scorer {
   @Override
   public long cost() {
-    return lead.iterator.cost();
+    return disi.cost();
   }
   @Override
   public Collection<ChildScorer> getChildren() {
-    ArrayList<ChildScorer> children = new ArrayList<>(docsAndFreqs.length);
+    ArrayList<ChildScorer> children = new ArrayList<>();
     for (Scorer scorer : scorers) {
       children.add(new ChildScorer(scorer, "MUST"));
     }

View File: org/apache/lucene/search/ExactPhraseScorer.java

@@ -18,9 +18,11 @@ package org.apache.lucene.search;
  */
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
-import org.apache.lucene.index.*;
+import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.BytesRef;
@@ -49,10 +51,11 @@ final class ExactPhraseScorer extends Scorer {
     }
   }
+  private final ConjunctionDISI conjunction;
   private final ChunkState[] chunkStates;
   private final PostingsEnum lead;
-  private int docID = -1;
   private int freq;
   private final Similarity.SimScorer docScorer;
@@ -72,49 +75,46 @@ final class ExactPhraseScorer extends Scorer {
     // min(cost)
     cost = lead.cost();
+    List<DocIdSetIterator> iterators = new ArrayList<>();
     for(int i=0;i<postings.length;i++) {
       chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position);
+      iterators.add(postings[i].postings);
     }
+    conjunction = ConjunctionDISI.intersect(iterators);
   }
-  private int doNext(int doc) throws IOException {
-    for(;;) {
-      // TODO: don't dup this logic from conjunctionscorer :)
-      advanceHead: for(;;) {
-        for (int i = 1; i < chunkStates.length; i++) {
-          final PostingsEnum de = chunkStates[i].posEnum;
-          if (de.docID() < doc) {
-            int d = de.advance(doc);
-            if (d > doc) {
-              // DocsEnum beyond the current doc - break and advance lead to the new highest doc.
-              doc = d;
-              break advanceHead;
-            }
-          }
-        }
-        // all DocsEnums are on the same doc
-        if (doc == NO_MORE_DOCS) {
-          return doc;
-        } else if (phraseFreq() > 0) {
-          return doc; // success: matches phrase
-        } else {
-          doc = lead.nextDoc(); // doesn't match phrase
-        }
-      }
-      // advance head for next iteration
-      doc = lead.advance(doc);
-    }
-  }
+  @Override
+  public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
+    return new TwoPhaseDocIdSetIterator() {
+      @Override
+      public boolean matches() throws IOException {
+        return phraseFreq() > 0;
+      }
+      @Override
+      public DocIdSetIterator approximation() {
+        return conjunction;
+      }
+    };
+  }
+  private int doNext(int doc) throws IOException {
+    for (;; doc = conjunction.nextDoc()) {
+      if (doc == NO_MORE_DOCS || phraseFreq() > 0) {
+        return doc;
+      }
+    }
+  }
   @Override
   public int nextDoc() throws IOException {
-    return docID = doNext(lead.nextDoc());
+    return doNext(conjunction.nextDoc());
   }
   @Override
   public int advance(int target) throws IOException {
-    return docID = doNext(lead.advance(target));
+    return doNext(conjunction.advance(target));
   }
   @Override
@@ -149,12 +149,12 @@ final class ExactPhraseScorer extends Scorer {
   @Override
   public int docID() {
-    return docID;
+    return conjunction.docID();
   }
   @Override
   public float score() {
-    return docScorer.score(docID, freq);
+    return docScorer.score(docID(), freq);
   }
   private int phraseFreq() throws IOException {

View File: org/apache/lucene/search/Scorer.java

@@ -60,7 +60,7 @@ public abstract class Scorer extends PostingsEnum {
    * {@link LeafCollector#collect}.
    */
   public abstract float score() throws IOException;
   /** returns parent Weight
    * @lucene.experimental
    */
@@ -99,4 +99,23 @@ public abstract class Scorer extends PostingsEnum {
       this.relationship = relationship;
     }
   }
+  /**
+   * Optional method: Return a {@link TwoPhaseDocIdSetIterator} view of this
+   * {@link Scorer}. A return value of {@code null} indicates that
+   * two-phase iteration is not supported.
+   *
+   * Note that the returned {@link TwoPhaseDocIdSetIterator}'s
+   * {@link TwoPhaseDocIdSetIterator#approximation() approximation} must
+   * advance synchronously with this iterator: advancing the approximation must
+   * advance this iterator and vice-versa.
+   *
+   * Implementing this method is typically useful on {@link Scorer}s
+   * that have a high per-document overhead in order to confirm matches.
+   *
+   * The default implementation returns {@code null}.
+   */
+  public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
+    return null;
+  }
 }
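To illustrate the contract described above, a hedged sketch of how a scorer with an expensive per-document check might override this hook (the approximation field and confirmCurrentDoc() are illustrative names, not Lucene APIs; the ExactPhraseScorer change in this commit follows the same shape):

@Override
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
  // The approximation must advance in lock-step with this scorer, so expose the
  // iterator this scorer is built on (e.g. a conjunction over cheap postings).
  final DocIdSetIterator approximation = this.approximation;
  return new TwoPhaseDocIdSetIterator() {
    @Override
    public DocIdSetIterator approximation() {
      return approximation;
    }

    @Override
    public boolean matches() throws IOException {
      // Expensive confirmation, only run on candidates produced by the approximation.
      return confirmCurrentDoc();
    }
  };
}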

View File: org/apache/lucene/search/TwoPhaseDocIdSetIterator.java (new file)

@@ -0,0 +1,81 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/**
* An approximation of a {@link DocIdSetIterator}. When the {@link #approximation()}'s
* {@link DocIdSetIterator#nextDoc()} or {@link DocIdSetIterator#advance(int)}
* return, {@link #matches()} needs to be checked in order to know whether the
* returned doc ID actually matches.
* @lucene.experimental
*/
public abstract class TwoPhaseDocIdSetIterator {
/** Return a {@link DocIdSetIterator} view of the provided
* {@link TwoPhaseDocIdSetIterator}. */
public static DocIdSetIterator asDocIdSetIterator(TwoPhaseDocIdSetIterator twoPhaseIterator) {
final DocIdSetIterator approximation = twoPhaseIterator.approximation();
return new DocIdSetIterator() {
@Override
public int docID() {
return approximation.docID();
}
@Override
public int nextDoc() throws IOException {
return doNext(approximation.nextDoc());
}
@Override
public int advance(int target) throws IOException {
return doNext(approximation.advance(target));
}
private int doNext(int doc) throws IOException {
for (;; doc = approximation.nextDoc()) {
if (doc == NO_MORE_DOCS) {
return NO_MORE_DOCS;
} else if (twoPhaseIterator.matches()) {
return doc;
}
}
}
@Override
public long cost() {
return approximation.cost();
}
};
}
/** Return an approximation. The returned {@link DocIdSetIterator} is a
* superset of the matching documents, and each match needs to be confirmed
* with {@link #matches()} in order to know whether it matches or not. */
public abstract DocIdSetIterator approximation();
/** Return whether the current doc ID that the iterator is on matches. This
* method should only be called when the iterator is positioned, i.e. not
* when {@link DocIdSetIterator#docID()} is {@code -1} or
* {@link DocIdSetIterator#NO_MORE_DOCS}. */
public abstract boolean matches() throws IOException;
}
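A minimal sketch of producing and consuming this class directly (candidates and confirmed are hypothetical inputs; this mirrors the approximation() helper used by TestConjunctionDISI further down):

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TwoPhaseDocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

class TwoPhaseExample {
  // "candidates" is a cheap superset of the matches; "confirmed" marks the
  // documents that really match.
  static DocIdSetIterator filterLazily(final DocIdSetIterator candidates, final FixedBitSet confirmed) {
    TwoPhaseDocIdSetIterator twoPhase = new TwoPhaseDocIdSetIterator() {
      @Override
      public DocIdSetIterator approximation() {
        return candidates;
      }

      @Override
      public boolean matches() throws IOException {
        // Only called while the approximation is positioned on a candidate doc.
        return confirmed.get(candidates.docID());
      }
    };
    // Callers that do not understand two-phase execution can still get a plain
    // view that only returns confirmed documents:
    return TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhase);
  }
}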

View File: org/apache/lucene/search/TestBooleanQuery.java

@@ -590,4 +590,33 @@ public class TestBooleanQuery extends LuceneTestCase {
     w.close();
     dir.close();
   }
+
+  public void testConjunctionSupportsApproximations() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    Field f = newTextField("field", "a b c", Field.Store.NO);
+    doc.add(f);
+    w.addDocument(doc);
+    w.commit();
+    DirectoryReader reader = w.getReader();
+    final IndexSearcher searcher = new IndexSearcher(reader);
+    PhraseQuery pq = new PhraseQuery();
+    pq.add(new Term("field", "a"));
+    pq.add(new Term("field", "b"));
+    BooleanQuery q = new BooleanQuery();
+    q.add(pq, Occur.MUST);
+    q.add(new TermQuery(new Term("field", "c")), Occur.FILTER);
+    final Weight weight = searcher.createNormalizedWeight(q, random().nextBoolean());
+    final Scorer scorer = weight.scorer(reader.leaves().get(0), null);
+    assertNotNull(scorer.asTwoPhaseIterator());
+    reader.close();
+    w.close();
+    dir.close();
+  }
 }

View File: org/apache/lucene/search/TestConjunctionDISI.java (new file)

@@ -0,0 +1,273 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestConjunctionDISI extends LuceneTestCase {
private static TwoPhaseDocIdSetIterator approximation(final DocIdSetIterator iterator, final FixedBitSet confirmed) {
return new TwoPhaseDocIdSetIterator() {
@Override
public DocIdSetIterator approximation() {
return iterator;
}
@Override
public boolean matches() throws IOException {
return confirmed.get(iterator.docID());
}
};
}
private static Scorer scorer(TwoPhaseDocIdSetIterator twoPhaseIterator) {
return scorer(TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator), twoPhaseIterator);
}
/**
* Create a {@link Scorer} that wraps the given {@link DocIdSetIterator}. It
* also accepts a {@link TwoPhaseDocIdSetIterator} view, which is exposed in
* {@link Scorer#asTwoPhaseIterator()}. When the two-phase view is not null,
* then {@link Scorer#nextDoc()} and {@link Scorer#advance(int)} will raise
* an exception in order to make sure that {@link ConjunctionDISI} takes
* advantage of the {@link TwoPhaseDocIdSetIterator} view.
*/
private static Scorer scorer(DocIdSetIterator it, TwoPhaseDocIdSetIterator twoPhaseIterator) {
return new Scorer(null) {
@Override
public TwoPhaseDocIdSetIterator asTwoPhaseIterator() {
return twoPhaseIterator;
}
@Override
public int docID() {
if (twoPhaseIterator != null) {
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
}
return it.docID();
}
@Override
public int nextDoc() throws IOException {
if (twoPhaseIterator != null) {
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
}
return it.nextDoc();
}
@Override
public int advance(int target) throws IOException {
if (twoPhaseIterator != null) {
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
}
return it.advance(target);
}
@Override
public long cost() {
if (twoPhaseIterator != null) {
throw new UnsupportedOperationException("ConjunctionDISI should call the two-phase iterator");
}
return it.cost();
}
@Override
public float score() throws IOException {
return 0;
}
@Override
public int freq() throws IOException {
return 0;
}
@Override
public int nextPosition() throws IOException {
return 0;
}
@Override
public int startOffset() throws IOException {
return 0;
}
@Override
public int endOffset() throws IOException {
return 0;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
};
}
private static FixedBitSet randomSet(int maxDoc) {
final int step = TestUtil.nextInt(random(), 1, 10);
FixedBitSet set = new FixedBitSet(maxDoc);
for (int doc = random().nextInt(step); doc < maxDoc; doc += TestUtil.nextInt(random(), 1, step)) {
set.set(doc);
}
return set;
}
private static FixedBitSet clearRandomBits(FixedBitSet other) {
final FixedBitSet set = new FixedBitSet(other.length());
set.or(other);
for (int i = 0; i < set.length(); ++i) {
if (random().nextBoolean()) {
set.clear(i);
}
}
return set;
}
private static FixedBitSet intersect(FixedBitSet[] bitSets) {
final FixedBitSet intersection = new FixedBitSet(bitSets[0].length());
intersection.or(bitSets[0]);
for (int i = 1; i < bitSets.length; ++i) {
intersection.and(bitSets[i]);
}
return intersection;
}
private static FixedBitSet toBitSet(int maxDoc, DocIdSetIterator iterator) throws IOException {
final FixedBitSet set = new FixedBitSet(maxDoc);
for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) {
set.set(doc);
}
return set;
}
// Test that the conjunction iterator is correct
public void testConjunction() throws IOException {
final int iters = atLeast(100);
for (int iter = 0; iter < iters; ++iter) {
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
final int numIterators = TestUtil.nextInt(random(), 2, 5);
final FixedBitSet[] sets = new FixedBitSet[numIterators];
final DocIdSetIterator[] iterators = new DocIdSetIterator[numIterators];
for (int i = 0; i < iterators.length; ++i) {
final FixedBitSet set = randomSet(maxDoc);
if (random().nextBoolean()) {
// simple iterator
sets[i] = set;
iterators[i] = new BitDocIdSet(set).iterator();
} else {
// scorer with approximation
final FixedBitSet confirmed = clearRandomBits(set);
sets[i] = confirmed;
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
iterators[i] = scorer(approximation);
}
}
final ConjunctionDISI conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators));
assertEquals(intersect(sets), toBitSet(maxDoc, conjunction));
}
}
// Test that the conjunction approximation is correct
public void testConjunctionApproximation() throws IOException {
final int iters = atLeast(100);
for (int iter = 0; iter < iters; ++iter) {
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
final int numIterators = TestUtil.nextInt(random(), 2, 5);
final FixedBitSet[] sets = new FixedBitSet[numIterators];
final DocIdSetIterator[] iterators = new DocIdSetIterator[numIterators];
boolean hasApproximation = false;
for (int i = 0; i < iterators.length; ++i) {
final FixedBitSet set = randomSet(maxDoc);
if (random().nextBoolean()) {
// simple iterator
sets[i] = set;
iterators[i] = new BitDocIdSet(set).iterator();
} else {
// scorer with approximation
final FixedBitSet confirmed = clearRandomBits(set);
sets[i] = confirmed;
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
iterators[i] = scorer(approximation);
hasApproximation = true;
}
}
final ConjunctionDISI conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators));
TwoPhaseDocIdSetIterator twoPhaseIterator = conjunction.asTwoPhaseIterator();
assertEquals(hasApproximation, twoPhaseIterator != null);
if (hasApproximation) {
assertEquals(intersect(sets), toBitSet(maxDoc, TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator)));
}
}
}
// This test makes sure that when nesting scorers with ConjunctionDISI, confirmations are pushed to the root.
public void testRecursiveConjunctionApproximation() throws IOException {
final int iters = atLeast(100);
for (int iter = 0; iter < iters; ++iter) {
final int maxDoc = TestUtil.nextInt(random(), 100, 10000);
final int numIterators = TestUtil.nextInt(random(), 2, 5);
final FixedBitSet[] sets = new FixedBitSet[numIterators];
DocIdSetIterator conjunction = null;
boolean hasApproximation = false;
for (int i = 0; i < numIterators; ++i) {
final FixedBitSet set = randomSet(maxDoc);
final DocIdSetIterator newIterator;
if (random().nextBoolean()) {
// simple iterator
sets[i] = set;
newIterator = new BitDocIdSet(set).iterator();
} else {
// scorer with approximation
final FixedBitSet confirmed = clearRandomBits(set);
sets[i] = confirmed;
final TwoPhaseDocIdSetIterator approximation = approximation(new BitDocIdSet(set).iterator(), confirmed);
newIterator = scorer(approximation);
hasApproximation = true;
}
if (conjunction == null) {
conjunction = newIterator;
} else {
final ConjunctionDISI conj = ConjunctionDISI.intersect(Arrays.asList(conjunction, newIterator));
conjunction = scorer(conj, conj.asTwoPhaseIterator());
}
}
TwoPhaseDocIdSetIterator twoPhaseIterator = ((Scorer) conjunction).asTwoPhaseIterator();
assertEquals(hasApproximation, twoPhaseIterator != null);
if (hasApproximation) {
assertEquals(intersect(sets), toBitSet(maxDoc, TwoPhaseDocIdSetIterator.asDocIdSetIterator(twoPhaseIterator)));
} else {
assertEquals(intersect(sets), toBitSet(maxDoc, conjunction));
}
}
}
}