diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5a83c312ef6..3c524401152 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -40,6 +40,10 @@ Improvements IndexReader after (illegally) removing the old index and reindexing (Vitaly Funstein, Robert Muir, Mike McCandless) +Optimizations + +* LUCENE-7330: Speed up conjunction queries. (Adrien Grand) + Other * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien diff --git a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java index 53e3753436a..205a349b73b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConjunctionDISI.java @@ -31,11 +31,13 @@ import org.apache.lucene.util.CollectionUtil; *
Public only for use in {@link org.apache.lucene.search.spans}. * @lucene.internal */ -public class ConjunctionDISI extends DocIdSetIterator { +public final class ConjunctionDISI extends DocIdSetIterator { - /** Create a conjunction over the provided {@link Scorer}s, taking advantage - * of {@link TwoPhaseIterator}. */ - public static ConjunctionDISI intersectScorers(List scorers) { + /** Create a conjunction over the provided {@link Scorer}s. Note that the + * returned {@link DocIdSetIterator} might leverage two-phase iteration in + * which case it is possible to retrieve the {@link TwoPhaseIterator} using + * {@link TwoPhaseIterator#unwrap}. */ + public static DocIdSetIterator intersectScorers(List scorers) { if (scorers.size() < 2) { throw new IllegalArgumentException("Cannot make a ConjunctionDISI of less than 2 iterators"); } @@ -45,15 +47,18 @@ public class ConjunctionDISI extends DocIdSetIterator { addScorer(scorer, allIterators, twoPhaseIterators); } - if (twoPhaseIterators.isEmpty()) { - return new ConjunctionDISI(allIterators); - } else { - return new TwoPhase(allIterators, twoPhaseIterators); + DocIdSetIterator iterator = new ConjunctionDISI(allIterators); + if (twoPhaseIterators.isEmpty() == false) { + iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators)); } + return iterator; } - /** Create a conjunction over the provided DocIdSetIterators. */ - public static ConjunctionDISI intersectIterators(List iterators) { + /** Create a conjunction over the provided DocIdSetIterators. Note that the + * returned {@link DocIdSetIterator} might leverage two-phase iteration in + * which case it is possible to retrieve the {@link TwoPhaseIterator} using + * {@link TwoPhaseIterator#unwrap}. */ + public static DocIdSetIterator intersectIterators(List iterators) { if (iterators.size() < 2) { throw new IllegalArgumentException("Cannot make a ConjunctionDISI of less than 2 iterators"); } @@ -63,16 +68,18 @@ public class ConjunctionDISI extends DocIdSetIterator { addIterator(iterator, allIterators, twoPhaseIterators); } - if (twoPhaseIterators.isEmpty()) { - return new ConjunctionDISI(allIterators); - } else { - return new TwoPhase(allIterators, twoPhaseIterators); + DocIdSetIterator iterator = new ConjunctionDISI(allIterators); + if (twoPhaseIterators.isEmpty() == false) { + iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators)); } + return iterator; } - /** Create a conjunction over the provided {@link Scorer}s, taking advantage - * of {@link TwoPhaseIterator}. */ - public static ConjunctionDISI intersectSpans(List spanList) { + /** Create a conjunction over the provided {@link Spans}. Note that the + * returned {@link DocIdSetIterator} might leverage two-phase iteration in + * which case it is possible to retrieve the {@link TwoPhaseIterator} using + * {@link TwoPhaseIterator#unwrap}. */ + public static DocIdSetIterator intersectSpans(List spanList) { if (spanList.size() < 2) { throw new IllegalArgumentException("Cannot make a ConjunctionDISI of less than 2 iterators"); } @@ -82,11 +89,11 @@ public class ConjunctionDISI extends DocIdSetIterator { addSpans(spans, allIterators, twoPhaseIterators); } - if (twoPhaseIterators.isEmpty()) { - return new ConjunctionDISI(allIterators); - } else { - return new TwoPhase(allIterators, twoPhaseIterators); + DocIdSetIterator iterator = new ConjunctionDISI(allIterators); + if (twoPhaseIterators.isEmpty() == false) { + iterator = TwoPhaseIterator.asDocIdSetIterator(new ConjunctionTwoPhaseIterator(iterator, twoPhaseIterators)); } + return iterator; } /** Adds the scorer, possibly splitting up into two phases or collapsing if it is another conjunction */ @@ -110,17 +117,16 @@ public class ConjunctionDISI extends DocIdSetIterator { } private static void addIterator(DocIdSetIterator disi, List allIterators, List twoPhaseIterators) { - // Check for exactly this class for collapsing. Subclasses can do their own optimizations. - if (disi.getClass() == ConjunctionDISI.class || disi.getClass() == TwoPhase.class) { + TwoPhaseIterator twoPhase = TwoPhaseIterator.unwrap(disi); + if (twoPhase != null) { + addTwoPhaseIterator(twoPhase, allIterators, twoPhaseIterators); + } else if (disi.getClass() == ConjunctionDISI.class) { // Check for exactly this class for collapsing ConjunctionDISI conjunction = (ConjunctionDISI) disi; // subconjuctions have already split themselves into two phase iterators and others, so we can take those // iterators as they are and move them up to this conjunction - allIterators.add(conjunction.lead); + allIterators.add(conjunction.lead1); + allIterators.add(conjunction.lead2); Collections.addAll(allIterators, conjunction.others); - if (conjunction.getClass() == TwoPhase.class) { - TwoPhase twoPhase = (TwoPhase) conjunction; - Collections.addAll(twoPhaseIterators, twoPhase.twoPhaseView.twoPhaseIterators); - } } else { allIterators.add(disi); } @@ -128,13 +134,17 @@ public class ConjunctionDISI extends DocIdSetIterator { private static void addTwoPhaseIterator(TwoPhaseIterator twoPhaseIter, List allIterators, List twoPhaseIterators) { addIterator(twoPhaseIter.approximation(), allIterators, twoPhaseIterators); - twoPhaseIterators.add(twoPhaseIter); + if (twoPhaseIter.getClass() == ConjunctionTwoPhaseIterator.class) { // Check for exactly this class for collapsing + Collections.addAll(twoPhaseIterators, ((ConjunctionTwoPhaseIterator) twoPhaseIter).twoPhaseIterators); + } else { + twoPhaseIterators.add(twoPhaseIter); + } } - final DocIdSetIterator lead; + final DocIdSetIterator lead1, lead2; final DocIdSetIterator[] others; - ConjunctionDISI(List iterators) { + private ConjunctionDISI(List iterators) { assert iterators.size() >= 2; // Sort the array the first time to allow the least frequent DocsEnum to // lead the matching. @@ -144,84 +154,77 @@ public class ConjunctionDISI extends DocIdSetIterator { return Long.compare(o1.cost(), o2.cost()); } }); - lead = iterators.get(0); - others = iterators.subList(1, iterators.size()).toArray(new DocIdSetIterator[0]); - } - - protected boolean matches() throws IOException { - return true; - } - - TwoPhaseIterator asTwoPhaseIterator() { - return null; + lead1 = iterators.get(0); + lead2 = iterators.get(1); + others = iterators.subList(2, iterators.size()).toArray(new DocIdSetIterator[0]); } private int doNext(int doc) throws IOException { - for(;;) { + advanceHead: for(;;) { + assert doc == lead1.docID(); - if (doc == NO_MORE_DOCS) { - // we need this check because it is only ok to call #matches when positioned - return NO_MORE_DOCS; + // find agreement between the two iterators with the lower costs + // we special case them because they do not need the + // 'other.docID() < doc' check that the 'others' iterators need + final int next2 = lead2.advance(doc); + if (next2 != doc) { + doc = lead1.advance(next2); + if (next2 != doc) { + continue; + } } - advanceHead: for(;;) { - for (DocIdSetIterator other : others) { - // invariant: docsAndFreqs[i].doc <= doc at this point. + // then find agreement with other iterators + for (DocIdSetIterator other : others) { + // other.doc may already be equal to doc if we "continued advanceHead" + // on the previous iteration and the advance on the lead scorer exactly matched. + if (other.docID() < doc) { + final int next = other.advance(doc); - // docsAndFreqs[i].doc may already be equal to doc if we "broke advanceHead" - // on the previous iteration and the advance on the lead scorer exactly matched. - if (other.docID() < doc) { - final int next = other.advance(doc); - - if (next > doc) { - // DocsEnum beyond the current doc - break and advance lead to the new highest doc. - doc = lead.advance(next); - break advanceHead; - } + if (next > doc) { + // iterator beyond the current doc - advance lead and continue to the new highest doc. + doc = lead1.advance(next); + continue advanceHead; } } - - if (matches()) { - // success - all DocsEnums are on the same doc - return doc; - } else { - doc = lead.nextDoc(); - break advanceHead; - } } + + // success - all iterators are on the same doc + return doc; } } @Override public int advance(int target) throws IOException { - return doNext(lead.advance(target)); + return doNext(lead1.advance(target)); } @Override public int docID() { - return lead.docID(); + return lead1.docID(); } @Override public int nextDoc() throws IOException { - return doNext(lead.nextDoc()); + return doNext(lead1.nextDoc()); } @Override public long cost() { - return lead.cost(); // overestimate + return lead1.cost(); // overestimate } /** - * {@link TwoPhaseIterator} view of a {@link TwoPhase} conjunction. + * {@link TwoPhaseIterator} implementing a conjunction. */ - private static class TwoPhaseConjunctionDISI extends TwoPhaseIterator { + private static final class ConjunctionTwoPhaseIterator extends TwoPhaseIterator { private final TwoPhaseIterator[] twoPhaseIterators; private final float matchCost; - private TwoPhaseConjunctionDISI(List iterators, List twoPhaseIterators) { - super(new ConjunctionDISI(iterators)); + private ConjunctionTwoPhaseIterator(DocIdSetIterator approximation, + List twoPhaseIterators) { + super(approximation); assert twoPhaseIterators.size() > 0; CollectionUtil.timSort(twoPhaseIterators, new Comparator() { @@ -259,37 +262,4 @@ public class ConjunctionDISI extends DocIdSetIterator { } - /** - * A conjunction DISI built on top of approximations. This implementation - * verifies that documents actually match by consulting the provided - * {@link TwoPhaseIterator}s. - * - * Another important difference with {@link ConjunctionDISI} is that this - * implementation supports approximations too: the approximation of this - * impl is the conjunction of the approximations of the wrapped iterators. - * This allows eg. {@code +"A B" +C} to be approximated as - * {@code +(+A +B) +C}. - */ - // NOTE: this is essentially the same as TwoPhaseDocIdSetIterator.asDocIdSetIterator - // but is its own impl in order to be able to expose a two-phase view - private static class TwoPhase extends ConjunctionDISI { - - final TwoPhaseConjunctionDISI twoPhaseView; - - private TwoPhase(List iterators, List twoPhaseIterators) { - super(iterators); - twoPhaseView = new TwoPhaseConjunctionDISI(iterators, twoPhaseIterators); - } - - @Override - public TwoPhaseConjunctionDISI asTwoPhaseIterator() { - return twoPhaseView; - } - - @Override - protected boolean matches() throws IOException { - return twoPhaseView.matches(); - } - } - } diff --git a/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java b/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java index 066f07c9e99..ab1de92475b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java @@ -25,7 +25,7 @@ import java.util.List; /** Scorer for conjunctions, sets of queries, all of which are required. */ class ConjunctionScorer extends Scorer { - final ConjunctionDISI disi; + final DocIdSetIterator disi; final Scorer[] scorers; final float coord; @@ -44,7 +44,7 @@ class ConjunctionScorer extends Scorer { @Override public TwoPhaseIterator twoPhaseIterator() { - return disi.asTwoPhaseIterator(); + return TwoPhaseIterator.unwrap(disi); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java index c3c8b713c35..f5ccc7c294b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java @@ -37,7 +37,7 @@ final class ExactPhraseScorer extends Scorer { } } - private final ConjunctionDISI conjunction; + private final DocIdSetIterator conjunction; private final PostingsAndPosition[] postings; private int freq; @@ -60,6 +60,7 @@ final class ExactPhraseScorer extends Scorer { postingsAndPositions.add(new PostingsAndPosition(posting.postings, posting.position)); } conjunction = ConjunctionDISI.intersectIterators(iterators); + assert TwoPhaseIterator.unwrap(conjunction) == null; this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]); this.matchCost = matchCost; } diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index 2ee3e70d21c..acc82820172 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -31,7 +31,7 @@ import org.apache.lucene.util.FixedBitSet; final class SloppyPhraseScorer extends Scorer { - private final ConjunctionDISI conjunction; + private final DocIdSetIterator conjunction; private final PhrasePositions[] phrasePositions; private float sloppyFreq; //phrase frequency in current doc as computed by phraseFreq(). @@ -70,6 +70,7 @@ final class SloppyPhraseScorer extends Scorer { phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms); } conjunction = ConjunctionDISI.intersectIterators(Arrays.asList(iterators)); + assert TwoPhaseIterator.unwrap(conjunction) == null; this.matchCost = matchCost; } diff --git a/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java b/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java index 38176bfdb79..ff472e7632f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/TwoPhaseIterator.java @@ -41,40 +41,61 @@ public abstract class TwoPhaseIterator { /** Return a {@link DocIdSetIterator} view of the provided * {@link TwoPhaseIterator}. */ public static DocIdSetIterator asDocIdSetIterator(TwoPhaseIterator twoPhaseIterator) { - final DocIdSetIterator approximation = twoPhaseIterator.approximation(); - return new DocIdSetIterator() { + return new TwoPhaseIteratorAsDocIdSetIterator(twoPhaseIterator); + } - @Override - public int docID() { - return approximation.docID(); - } + /** + * If the given {@link DocIdSetIterator} has been created with + * {@link #asDocIdSetIterator}, then this will return the wrapped + * {@link TwoPhaseIterator}. Otherwise this returns {@code null}. + */ + public static TwoPhaseIterator unwrap(DocIdSetIterator iterator) { + if (iterator instanceof TwoPhaseIteratorAsDocIdSetIterator) { + return ((TwoPhaseIteratorAsDocIdSetIterator) iterator).twoPhaseIterator; + } else { + return null; + } + } - @Override - public int nextDoc() throws IOException { - return doNext(approximation.nextDoc()); - } + private static class TwoPhaseIteratorAsDocIdSetIterator extends DocIdSetIterator { - @Override - public int advance(int target) throws IOException { - return doNext(approximation.advance(target)); - } + final TwoPhaseIterator twoPhaseIterator; + final DocIdSetIterator approximation; - private int doNext(int doc) throws IOException { - for (;; doc = approximation.nextDoc()) { - if (doc == NO_MORE_DOCS) { - return NO_MORE_DOCS; - } else if (twoPhaseIterator.matches()) { - return doc; - } + TwoPhaseIteratorAsDocIdSetIterator(TwoPhaseIterator twoPhaseIterator) { + this.twoPhaseIterator = twoPhaseIterator; + this.approximation = twoPhaseIterator.approximation; + } + + @Override + public int docID() { + return approximation.docID(); + } + + @Override + public int nextDoc() throws IOException { + return doNext(approximation.nextDoc()); + } + + @Override + public int advance(int target) throws IOException { + return doNext(approximation.advance(target)); + } + + private int doNext(int doc) throws IOException { + for (;; doc = approximation.nextDoc()) { + if (doc == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } else if (twoPhaseIterator.matches()) { + return doc; } } + } - @Override - public long cost() { - return approximation.cost(); - } - - }; + @Override + public long cost() { + return approximation.cost(); + } } /** Return an approximation. The returned {@link DocIdSetIterator} is a diff --git a/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java b/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java index 269990d3398..dcb666417c7 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestConjunctionDISI.java @@ -181,7 +181,7 @@ public class TestConjunctionDISI extends LuceneTestCase { } } - final ConjunctionDISI conjunction = ConjunctionDISI.intersectScorers(Arrays.asList(iterators)); + final DocIdSetIterator conjunction = ConjunctionDISI.intersectScorers(Arrays.asList(iterators)); assertEquals(intersect(sets), toBitSet(maxDoc, conjunction)); } } @@ -211,8 +211,8 @@ public class TestConjunctionDISI extends LuceneTestCase { } } - final ConjunctionDISI conjunction = ConjunctionDISI.intersectScorers(Arrays.asList(iterators)); - TwoPhaseIterator twoPhaseIterator = conjunction.asTwoPhaseIterator(); + final DocIdSetIterator conjunction = ConjunctionDISI.intersectScorers(Arrays.asList(iterators)); + TwoPhaseIterator twoPhaseIterator = TwoPhaseIterator.unwrap(conjunction); assertEquals(hasApproximation, twoPhaseIterator != null); if (hasApproximation) { assertEquals(intersect(sets), toBitSet(maxDoc, TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator))); @@ -248,8 +248,8 @@ public class TestConjunctionDISI extends LuceneTestCase { if (conjunction == null) { conjunction = newIterator; } else { - final ConjunctionDISI conj = ConjunctionDISI.intersectScorers(Arrays.asList(conjunction, newIterator)); - conjunction = scorer(conj, conj.asTwoPhaseIterator()); + final DocIdSetIterator conj = ConjunctionDISI.intersectScorers(Arrays.asList(conjunction, newIterator)); + conjunction = scorer(conj, TwoPhaseIterator.unwrap(conj)); } } @@ -309,7 +309,7 @@ public class TestConjunctionDISI extends LuceneTestCase { } - final ConjunctionDISI conjunction = ConjunctionDISI.intersectScorers(scorers); + final DocIdSetIterator conjunction = ConjunctionDISI.intersectScorers(scorers); assertEquals(intersect(sets), toBitSet(maxDoc, conjunction)); } }