mirror of https://github.com/apache/lucene.git
LUCENE-7330: Speed up conjunctions.
This commit is contained in:
parent
af2ae05d6e
commit
72914198e6
|
@ -40,6 +40,10 @@ Improvements
|
|||
IndexReader after (illegally) removing the old index and
|
||||
reindexing (Vitaly Funstein, Robert Muir, Mike McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-7330: Speed up conjunction queries. (Adrien Grand)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien
|
||||
|
|
|
@ -31,11 +31,13 @@ import org.apache.lucene.util.CollectionUtil;
|
|||
* <br>Public only for use in {@link org.apache.lucene.search.spans}.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public class ConjunctionDISI extends DocIdSetIterator {
|
||||
public final class ConjunctionDISI extends DocIdSetIterator {
|
||||
|
||||
/** Create a conjunction over the provided {@link Scorer}s, taking advantage
|
||||
* of {@link TwoPhaseIterator}. */
|
||||
public static ConjunctionDISI intersectScorers(List<Scorer> scorers) {
|
||||
/** Create a conjunction over the provided {@link Scorer}s. Note that the
|
||||
* returned {@link DocIdSetIterator} might leverage two-phase iteration in
|
||||
* which case it is possible to retrieve the {@link TwoPhaseIterator} using
|
||||
* {@link TwoPhaseIterator#unwrap}. */
|
||||
public static DocIdSetIterator intersectScorers(List<Scorer> scorers) {
|
||||
if (scorers.size() < 2) {
|
||||
throw new IllegalArgumentException("Cannot make a ConjunctionDISI of less than 2 iterators");
|
||||
}
|
||||
|
@ -45,15 +47,18 @@ public class ConjunctionDISI extends DocIdSetIterator {
|
|||
addScorer(scorer, allIterators, twoPhaseIterators);
|
||||
}
|
||||
|
||||
if (twoPhaseIterators.isEmpty()) {
|
||||
return new ConjunctionDISI(allIterators);
|
||||
} else {
|
||||
return new TwoPhase(allIterators, twoPhaseIterators);
|
||||
DocIdSetIterator iterator = new ConjunctionDISI(allIterators);
|
||||
if (twoPhaseIterators.isEmpty() == false) {
|
||||
iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators));
|
||||
}
|
||||
return iterator;
|
||||
}
|
||||
|
||||
/** Create a conjunction over the provided DocIdSetIterators. */
|
||||
public static ConjunctionDISI intersectIterators(List<DocIdSetIterator> iterators) {
|
||||
/** Create a conjunction over the provided DocIdSetIterators. Note that the
|
||||
* returned {@link DocIdSetIterator} might leverage two-phase iteration in
|
||||
* which case it is possible to retrieve the {@link TwoPhaseIterator} using
|
||||
* {@link TwoPhaseIterator#unwrap}. */
|
||||
public static DocIdSetIterator intersectIterators(List<DocIdSetIterator> iterators) {
|
||||
if (iterators.size() < 2) {
|
||||
throw new IllegalArgumentException("Cannot make a ConjunctionDISI of less than 2 iterators");
|
||||
}
|
||||
|
@ -63,16 +68,18 @@ public class ConjunctionDISI extends DocIdSetIterator {
|
|||
addIterator(iterator, allIterators, twoPhaseIterators);
|
||||
}
|
||||
|
||||
if (twoPhaseIterators.isEmpty()) {
|
||||
return new ConjunctionDISI(allIterators);
|
||||
} else {
|
||||
return new TwoPhase(allIterators, twoPhaseIterators);
|
||||
DocIdSetIterator iterator = new ConjunctionDISI(allIterators);
|
||||
if (twoPhaseIterators.isEmpty() == false) {
|
||||
iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators));
|
||||
}
|
||||
return iterator;
|
||||
}
|
||||
|
||||
/** Create a conjunction over the provided {@link Scorer}s, taking advantage
|
||||
* of {@link TwoPhaseIterator}. */
|
||||
public static ConjunctionDISI intersectSpans(List<Spans> spanList) {
|
||||
/** Create a conjunction over the provided {@link Spans}. Note that the
|
||||
* returned {@link DocIdSetIterator} might leverage two-phase iteration in
|
||||
* which case it is possible to retrieve the {@link TwoPhaseIterator} using
|
||||
* {@link TwoPhaseIterator#unwrap}. */
|
||||
public static DocIdSetIterator intersectSpans(List<Spans> spanList) {
|
||||
if (spanList.size() < 2) {
|
||||
throw new IllegalArgumentException("Cannot make a ConjunctionDISI of less than 2 iterators");
|
||||
}
|
||||
|
@ -82,11 +89,11 @@ public class ConjunctionDISI extends DocIdSetIterator {
|
|||
addSpans(spans, allIterators, twoPhaseIterators);
|
||||
}
|
||||
|
||||
if (twoPhaseIterators.isEmpty()) {
|
||||
return new ConjunctionDISI(allIterators);
|
||||
} else {
|
||||
return new TwoPhase(allIterators, twoPhaseIterators);
|
||||
DocIdSetIterator iterator = new ConjunctionDISI(allIterators);
|
||||
if (twoPhaseIterators.isEmpty() == false) {
|
||||
iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators));
|
||||
}
|
||||
return iterator;
|
||||
}
|
||||
|
||||
/** Adds the scorer, possibly splitting up into two phases or collapsing if it is another conjunction */
|
||||
|
@ -110,17 +117,16 @@ public class ConjunctionDISI extends DocIdSetIterator {
|
|||
}
|
||||
|
||||
private static void addIterator(DocIdSetIterator disi, List<DocIdSetIterator> allIterators, List<TwoPhaseIterator> twoPhaseIterators) {
|
||||
// Check for exactly this class for collapsing. Subclasses can do their own optimizations.
|
||||
if (disi.getClass() == ConjunctionDISI.class || disi.getClass() == TwoPhase.class) {
|
||||
TwoPhaseIterator twoPhase = TwoPhaseIterator.unwrap(disi);
|
||||
if (twoPhase != null) {
|
||||
addTwoPhaseIterator(twoPhase, allIterators, twoPhaseIterators);
|
||||
} else if (disi.getClass() == ConjunctionDISI.class) { // Check for exactly this class for collapsing
|
||||
ConjunctionDISI conjunction = (ConjunctionDISI) disi;
|
||||
// subconjuctions have already split themselves into two phase iterators and others, so we can take those
|
||||
// iterators as they are and move them up to this conjunction
|
||||
allIterators.add(conjunction.lead);
|
||||
allIterators.add(conjunction.lead1);
|
||||
allIterators.add(conjunction.lead2);
|
||||
Collections.addAll(allIterators, conjunction.others);
|
||||
if (conjunction.getClass() == TwoPhase.class) {
|
||||
TwoPhase twoPhase = (TwoPhase) conjunction;
|
||||
Collections.addAll(twoPhaseIterators, twoPhase.twoPhaseView.twoPhaseIterators);
|
||||
}
|
||||
} else {
|
||||
allIterators.add(disi);
|
||||
}
|
||||
|
@ -128,13 +134,17 @@ public class ConjunctionDISI extends DocIdSetIterator {
|
|||
|
||||
private static void addTwoPhaseIterator(TwoPhaseIterator twoPhaseIter, List<DocIdSetIterator> allIterators, List<TwoPhaseIterator> twoPhaseIterators) {
|
||||
addIterator(twoPhaseIter.approximation(), allIterators, twoPhaseIterators);
|
||||
twoPhaseIterators.add(twoPhaseIter);
|
||||
if (twoPhaseIter.getClass() == ConjunctionTwoPhaseIterator.class) { // Check for exactly this class for collapsing
|
||||
Collections.addAll(twoPhaseIterators, ((ConjunctionTwoPhaseIterator) twoPhaseIter).twoPhaseIterators);
|
||||
} else {
|
||||
twoPhaseIterators.add(twoPhaseIter);
|
||||
}
|
||||
}
|
||||
|
||||
final DocIdSetIterator lead;
|
||||
final DocIdSetIterator lead1, lead2;
|
||||
final DocIdSetIterator[] others;
|
||||
|
||||
ConjunctionDISI(List<? extends DocIdSetIterator> iterators) {
|
||||
private ConjunctionDISI(List<? extends DocIdSetIterator> iterators) {
|
||||
assert iterators.size() >= 2;
|
||||
// Sort the array the first time to allow the least frequent DocsEnum to
|
||||
// lead the matching.
|
||||
|
@ -144,84 +154,77 @@ public class ConjunctionDISI extends DocIdSetIterator {
|
|||
return Long.compare(o1.cost(), o2.cost());
|
||||
}
|
||||
});
|
||||
lead = iterators.get(0);
|
||||
others = iterators.subList(1, iterators.size()).toArray(new DocIdSetIterator[0]);
|
||||
}
|
||||
|
||||
protected boolean matches() throws IOException {
|
||||
return true;
|
||||
}
|
||||
|
||||
TwoPhaseIterator asTwoPhaseIterator() {
|
||||
return null;
|
||||
lead1 = iterators.get(0);
|
||||
lead2 = iterators.get(1);
|
||||
others = iterators.subList(2, iterators.size()).toArray(new DocIdSetIterator[0]);
|
||||
}
|
||||
|
||||
private int doNext(int doc) throws IOException {
|
||||
for(;;) {
|
||||
advanceHead: for(;;) {
|
||||
assert doc == lead1.docID();
|
||||
|
||||
if (doc == NO_MORE_DOCS) {
|
||||
// we need this check because it is only ok to call #matches when positioned
|
||||
return NO_MORE_DOCS;
|
||||
// find agreement between the two iterators with the lower costs
|
||||
// we special case them because they do not need the
|
||||
// 'other.docID() < doc' check that the 'others' iterators need
|
||||
final int next2 = lead2.advance(doc);
|
||||
if (next2 != doc) {
|
||||
doc = lead1.advance(next2);
|
||||
if (next2 != doc) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
advanceHead: for(;;) {
|
||||
for (DocIdSetIterator other : others) {
|
||||
// invariant: docsAndFreqs[i].doc <= doc at this point.
|
||||
// then find agreement with other iterators
|
||||
for (DocIdSetIterator other : others) {
|
||||
// other.doc may already be equal to doc if we "continued advanceHead"
|
||||
// on the previous iteration and the advance on the lead scorer exactly matched.
|
||||
if (other.docID() < doc) {
|
||||
final int next = other.advance(doc);
|
||||
|
||||
// docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead"
|
||||
// on the previous iteration and the advance on the lead scorer exactly matched.
|
||||
if (other.docID() < doc) {
|
||||
final int next = other.advance(doc);
|
||||
|
||||
if (next > doc) {
|
||||
// DocsEnum beyond the current doc - break and advance lead to the new highest doc.
|
||||
doc = lead.advance(next);
|
||||
break advanceHead;
|
||||
}
|
||||
if (next > doc) {
|
||||
// iterator beyond the current doc - advance lead and continue to the new highest doc.
|
||||
doc = lead1.advance(next);
|
||||
continue advanceHead;
|
||||
}
|
||||
}
|
||||
|
||||
if (matches()) {
|
||||
// success - all DocsEnums are on the same doc
|
||||
return doc;
|
||||
} else {
|
||||
doc = lead.nextDoc();
|
||||
break advanceHead;
|
||||
}
|
||||
}
|
||||
|
||||
// success - all iterators are on the same doc
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return doNext(lead.advance(target));
|
||||
return doNext(lead1.advance(target));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return lead.docID();
|
||||
return lead1.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return doNext(lead.nextDoc());
|
||||
return doNext(lead1.nextDoc());
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return lead.cost(); // overestimate
|
||||
return lead1.cost(); // overestimate
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link TwoPhaseIterator} view of a {@link TwoPhase} conjunction.
|
||||
* {@link TwoPhaseIterator} implementing a conjunction.
|
||||
*/
|
||||
private static class TwoPhaseConjunctionDISI extends TwoPhaseIterator {
|
||||
private static final class ConjunctionTwoPhaseIterator extends TwoPhaseIterator {
|
||||
|
||||
private final TwoPhaseIterator[] twoPhaseIterators;
|
||||
private final float matchCost;
|
||||
|
||||
private TwoPhaseConjunctionDISI(List<? extends DocIdSetIterator> iterators, List<TwoPhaseIterator> twoPhaseIterators) {
|
||||
super(new ConjunctionDISI(iterators));
|
||||
private ConjunctionTwoPhaseIterator(DocIdSetIterator approximation,
|
||||
List<? extends TwoPhaseIterator> twoPhaseIterators) {
|
||||
super(approximation);
|
||||
assert twoPhaseIterators.size() > 0;
|
||||
|
||||
CollectionUtil.timSort(twoPhaseIterators, new Comparator<TwoPhaseIterator>() {
|
||||
|
@ -259,37 +262,4 @@ public class ConjunctionDISI extends DocIdSetIterator {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* A conjunction DISI built on top of approximations. This implementation
|
||||
* verifies that documents actually match by consulting the provided
|
||||
* {@link TwoPhaseIterator}s.
|
||||
*
|
||||
* Another important difference with {@link ConjunctionDISI} is that this
|
||||
* implementation supports approximations too: the approximation of this
|
||||
* impl is the conjunction of the approximations of the wrapped iterators.
|
||||
* This allows eg. {@code +"A B" +C} to be approximated as
|
||||
* {@code +(+A +B) +C}.
|
||||
*/
|
||||
// NOTE: this is essentially the same as TwoPhaseDocIdSetIterator.asDocIdSetIterator
|
||||
// but is its own impl in order to be able to expose a two-phase view
|
||||
private static class TwoPhase extends ConjunctionDISI {
|
||||
|
||||
final TwoPhaseConjunctionDISI twoPhaseView;
|
||||
|
||||
private TwoPhase(List<? extends DocIdSetIterator> iterators, List<TwoPhaseIterator> twoPhaseIterators) {
|
||||
super(iterators);
|
||||
twoPhaseView = new TwoPhaseConjunctionDISI(iterators, twoPhaseIterators);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TwoPhaseConjunctionDISI asTwoPhaseIterator() {
|
||||
return twoPhaseView;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean matches() throws IOException {
|
||||
return twoPhaseView.matches();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ import java.util.List;
|
|||
/** Scorer for conjunctions, sets of queries, all of which are required. */
|
||||
class ConjunctionScorer extends Scorer {
|
||||
|
||||
final ConjunctionDISI disi;
|
||||
final DocIdSetIterator disi;
|
||||
final Scorer[] scorers;
|
||||
final float coord;
|
||||
|
||||
|
@ -44,7 +44,7 @@ class ConjunctionScorer extends Scorer {
|
|||
|
||||
@Override
|
||||
public TwoPhaseIterator twoPhaseIterator() {
|
||||
return disi.asTwoPhaseIterator();
|
||||
return TwoPhaseIterator.unwrap(disi);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -37,7 +37,7 @@ final class ExactPhraseScorer extends Scorer {
|
|||
}
|
||||
}
|
||||
|
||||
private final ConjunctionDISI conjunction;
|
||||
private final DocIdSetIterator conjunction;
|
||||
private final PostingsAndPosition[] postings;
|
||||
|
||||
private int freq;
|
||||
|
@ -60,6 +60,7 @@ final class ExactPhraseScorer extends Scorer {
|
|||
postingsAndPositions.add(new PostingsAndPosition(posting.postings, posting.position));
|
||||
}
|
||||
conjunction = ConjunctionDISI.intersectIterators(iterators);
|
||||
assert TwoPhaseIterator.unwrap(conjunction) == null;
|
||||
this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
|
||||
this.matchCost = matchCost;
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ import org.apache.lucene.util.FixedBitSet;
|
|||
|
||||
final class SloppyPhraseScorer extends Scorer {
|
||||
|
||||
private final ConjunctionDISI conjunction;
|
||||
private final DocIdSetIterator conjunction;
|
||||
private final PhrasePositions[] phrasePositions;
|
||||
|
||||
private float sloppyFreq; //phrase frequency in current doc as computed by phraseFreq().
|
||||
|
@ -70,6 +70,7 @@ final class SloppyPhraseScorer extends Scorer {
|
|||
phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
|
||||
}
|
||||
conjunction = ConjunctionDISI.intersectIterators(Arrays.asList(iterators));
|
||||
assert TwoPhaseIterator.unwrap(conjunction) == null;
|
||||
this.matchCost = matchCost;
|
||||
}
|
||||
|
||||
|
|
|
@ -41,40 +41,61 @@ public abstract class TwoPhaseIterator {
|
|||
/** Return a {@link DocIdSetIterator} view of the provided
|
||||
* {@link TwoPhaseIterator}. */
|
||||
public static DocIdSetIterator asDocIdSetIterator(TwoPhaseIterator twoPhaseIterator) {
|
||||
final DocIdSetIterator approximation = twoPhaseIterator.approximation();
|
||||
return new DocIdSetIterator() {
|
||||
return new TwoPhaseIteratorAsDocIdSetIterator(twoPhaseIterator);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return approximation.docID();
|
||||
}
|
||||
/**
|
||||
* If the given {@link DocIdSetIterator} has been created with
|
||||
* {@link #asDocIdSetIterator}, then this will return the wrapped
|
||||
* {@link TwoPhaseIterator}. Otherwise this returns {@code null}.
|
||||
*/
|
||||
public static TwoPhaseIterator unwrap(DocIdSetIterator iterator) {
|
||||
if (iterator instanceof TwoPhaseIteratorAsDocIdSetIterator) {
|
||||
return ((TwoPhaseIteratorAsDocIdSetIterator) iterator).twoPhaseIterator;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return doNext(approximation.nextDoc());
|
||||
}
|
||||
private static class TwoPhaseIteratorAsDocIdSetIterator extends DocIdSetIterator {
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return doNext(approximation.advance(target));
|
||||
}
|
||||
final TwoPhaseIterator twoPhaseIterator;
|
||||
final DocIdSetIterator approximation;
|
||||
|
||||
private int doNext(int doc) throws IOException {
|
||||
for (;; doc = approximation.nextDoc()) {
|
||||
if (doc == NO_MORE_DOCS) {
|
||||
return NO_MORE_DOCS;
|
||||
} else if (twoPhaseIterator.matches()) {
|
||||
return doc;
|
||||
}
|
||||
TwoPhaseIteratorAsDocIdSetIterator(TwoPhaseIterator twoPhaseIterator) {
|
||||
this.twoPhaseIterator = twoPhaseIterator;
|
||||
this.approximation = twoPhaseIterator.approximation;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return approximation.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return doNext(approximation.nextDoc());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return doNext(approximation.advance(target));
|
||||
}
|
||||
|
||||
private int doNext(int doc) throws IOException {
|
||||
for (;; doc = approximation.nextDoc()) {
|
||||
if (doc == NO_MORE_DOCS) {
|
||||
return NO_MORE_DOCS;
|
||||
} else if (twoPhaseIterator.matches()) {
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return approximation.cost();
|
||||
}
|
||||
|
||||
};
|
||||
@Override
|
||||
public long cost() {
|
||||
return approximation.cost();
|
||||
}
|
||||
}
|
||||
|
||||
/** Return an approximation. The returned {@link DocIdSetIterator} is a
|
||||
|
|
|
@ -181,7 +181,7 @@ public class TestConjunctionDISI extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
final ConjunctionDISI conjunction = ConjunctionDISI.intersectScorers(Arrays.asList(iterators));
|
||||
final DocIdSetIterator conjunction = ConjunctionDISI.intersectScorers(Arrays.asList(iterators));
|
||||
assertEquals(intersect(sets), toBitSet(maxDoc, conjunction));
|
||||
}
|
||||
}
|
||||
|
@ -211,8 +211,8 @@ public class TestConjunctionDISI extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
final ConjunctionDISI conjunction = ConjunctionDISI.intersectScorers(Arrays.asList(iterators));
|
||||
TwoPhaseIterator twoPhaseIterator = conjunction.asTwoPhaseIterator();
|
||||
final DocIdSetIterator conjunction = ConjunctionDISI.intersectScorers(Arrays.asList(iterators));
|
||||
TwoPhaseIterator twoPhaseIterator = TwoPhaseIterator.unwrap(conjunction);
|
||||
assertEquals(hasApproximation, twoPhaseIterator != null);
|
||||
if (hasApproximation) {
|
||||
assertEquals(intersect(sets), toBitSet(maxDoc, TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator)));
|
||||
|
@ -248,8 +248,8 @@ public class TestConjunctionDISI extends LuceneTestCase {
|
|||
if (conjunction == null) {
|
||||
conjunction = newIterator;
|
||||
} else {
|
||||
final ConjunctionDISI conj = ConjunctionDISI.intersectScorers(Arrays.asList(conjunction, newIterator));
|
||||
conjunction = scorer(conj, conj.asTwoPhaseIterator());
|
||||
final DocIdSetIterator conj = ConjunctionDISI.intersectScorers(Arrays.asList(conjunction, newIterator));
|
||||
conjunction = scorer(conj, TwoPhaseIterator.unwrap(conj));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -309,7 +309,7 @@ public class TestConjunctionDISI extends LuceneTestCase {
|
|||
}
|
||||
|
||||
|
||||
final ConjunctionDISI conjunction = ConjunctionDISI.intersectScorers(scorers);
|
||||
final DocIdSetIterator conjunction = ConjunctionDISI.intersectScorers(scorers);
|
||||
assertEquals(intersect(sets), toBitSet(maxDoc, conjunction));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue