From 9d5dc0cf573cf8fc75dd7799844db2cb0fa52da8 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 30 Jan 2017 10:46:22 +0100 Subject: [PATCH] LUCENE-7654: To-parent block joins should implement two-phase iteration. --- lucene/CHANGES.txt | 4 + .../search/join/ToParentBlockJoinQuery.java | 338 ++++++++++-------- .../lucene/search/join/TestBlockJoin.java | 5 +- .../search/join/TestBlockJoinValidation.java | 19 - 4 files changed, 190 insertions(+), 176 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6a0d17ddeaa..17a528e78ff 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -135,6 +135,10 @@ Optimizations whether BKD cells are entirely within the distance close to the dateline. (Adrien Grand) +* LUCENE-7654: ToParentBlockJoinQuery now implements two-phase iteration and + computes scores lazily in order to be faster when used in conjunctions. + (Adrien Grand) + Build * LUCENE-7651: Fix Javadocs build for Java 8u121 by injecting "Google Code diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java index 6369eea0de9..28374233bd0 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java @@ -20,15 +20,18 @@ import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.Locale; + import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.FilterWeight; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.FilterWeight; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; import org.apache.lucene.util.BitSet; @@ -74,7 +77,7 @@ public class ToParentBlockJoinQuery extends Query { private final ScoreMode scoreMode; /** Create a ToParentBlockJoinQuery. - * + * * @param childQuery Query matching child documents. * @param parentsFilter Filter identifying the parent documents. * @param scoreMode How to aggregate multiple child scores @@ -100,7 +103,7 @@ public class ToParentBlockJoinQuery extends Query { public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { return new BlockJoinWeight(this, childQuery.createWeight(searcher, needsScores, boost), parentsFilter, needsScores ? scoreMode : ScoreMode.None); } - + /** Return our child query. */ public Query getChildQuery() { return childQuery; @@ -116,33 +119,44 @@ public class ToParentBlockJoinQuery extends Query { this.scoreMode = scoreMode; } + @Override + public Scorer scorer(LeafReaderContext context) throws IOException { + final ScorerSupplier scorerSupplier = scorerSupplier(context); + if (scorerSupplier == null) { + return null; + } + return scorerSupplier.get(false); + } + // NOTE: acceptDocs applies (and is checked) only in the // parent document space @Override - public Scorer scorer(LeafReaderContext readerContext) throws IOException { - - final Scorer childScorer = in.scorer(readerContext); - if (childScorer == null) { - // No matches - return null; - } - - final int firstChildDoc = childScorer.iterator().nextDoc(); - if (firstChildDoc == DocIdSetIterator.NO_MORE_DOCS) { - // No matches + public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { + final ScorerSupplier childScorerSupplier = in.scorerSupplier(context); + if (childScorerSupplier == null) { return null; } // NOTE: this does not take accept docs into account, the responsibility // to not match deleted docs is on the scorer - final BitSet parents = parentsFilter.getBitSet(readerContext); - + final BitSet parents = parentsFilter.getBitSet(context); if (parents == null) { // No matches return null; } - return new BlockJoinScorer(this, childScorer, parents, firstChildDoc, scoreMode); + return new ScorerSupplier() { + + @Override + public Scorer get(boolean randomAccess) throws IOException { + return new BlockJoinScorer(BlockJoinWeight.this, childScorerSupplier.get(randomAccess), parents, scoreMode); + } + + @Override + public long cost() { + return childScorerSupplier.cost(); + } + }; } @Override @@ -154,25 +168,108 @@ public class ToParentBlockJoinQuery extends Query { return Explanation.noMatch("Not a match"); } } - + + private static class ParentApproximation extends DocIdSetIterator { + + private final DocIdSetIterator childApproximation; + private final BitSet parentBits; + private int doc = -1; + + ParentApproximation(DocIdSetIterator childApproximation, BitSet parentBits) { + this.childApproximation = childApproximation; + this.parentBits = parentBits; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target >= parentBits.length()) { + return doc = NO_MORE_DOCS; + } + final int firstChildTarget = target == 0 ? 0 : parentBits.prevSetBit(target - 1) + 1; + int childDoc = childApproximation.docID(); + if (childDoc < firstChildTarget) { + childDoc = childApproximation.advance(firstChildTarget); + } + if (childDoc >= parentBits.length() - 1) { + return doc = NO_MORE_DOCS; + } + return doc = parentBits.nextSetBit(childDoc + 1); + } + + @Override + public long cost() { + return childApproximation.cost(); + } + } + + private static class ParentTwoPhase extends TwoPhaseIterator { + + private final ParentApproximation parentApproximation; + private final DocIdSetIterator childApproximation; + private final TwoPhaseIterator childTwoPhase; + + ParentTwoPhase(ParentApproximation parentApproximation, TwoPhaseIterator childTwoPhase) { + super(parentApproximation); + this.parentApproximation = parentApproximation; + this.childApproximation = childTwoPhase.approximation(); + this.childTwoPhase = childTwoPhase; + } + + @Override + public boolean matches() throws IOException { + assert childApproximation.docID() < parentApproximation.docID(); + do { + if (childTwoPhase.matches()) { + return true; + } + } while (childApproximation.nextDoc() < parentApproximation.docID()); + return false; + } + + @Override + public float matchCost() { + // TODO: how could we compute a match cost? + return childTwoPhase.matchCost() + 10; + } + } + static class BlockJoinScorer extends Scorer { private final Scorer childScorer; private final BitSet parentBits; private final ScoreMode scoreMode; - private int parentDoc = -1; - private int prevParentDoc; - private float parentScore; - private int parentFreq; - private int nextChildDoc; - private int childDocUpto; + private final DocIdSetIterator childApproximation; + private final TwoPhaseIterator childTwoPhase; + private final ParentApproximation parentApproximation; + private final ParentTwoPhase parentTwoPhase; + private float score; + private int freq; - public BlockJoinScorer(Weight weight, Scorer childScorer, BitSet parentBits, int firstChildDoc, ScoreMode scoreMode) { + public BlockJoinScorer(Weight weight, Scorer childScorer, BitSet parentBits, ScoreMode scoreMode) { super(weight); //System.out.println("Q.init firstChildDoc=" + firstChildDoc); this.parentBits = parentBits; this.childScorer = childScorer; this.scoreMode = scoreMode; - nextChildDoc = firstChildDoc; + childTwoPhase = childScorer.twoPhaseIterator(); + if (childTwoPhase == null) { + childApproximation = childScorer.iterator(); + parentApproximation = new ParentApproximation(childApproximation, parentBits); + parentTwoPhase = null; + } else { + childApproximation = childTwoPhase.approximation(); + parentApproximation = new ParentApproximation(childTwoPhase.approximation(), parentBits); + parentTwoPhase = new ParentTwoPhase(parentApproximation, childTwoPhase); + } } @Override @@ -182,152 +279,80 @@ public class ToParentBlockJoinQuery extends Query { @Override public DocIdSetIterator iterator() { - return new DocIdSetIterator() { - final DocIdSetIterator childIt = childScorer.iterator(); - - @Override - public int nextDoc() throws IOException { - //System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc); - if (nextChildDoc == NO_MORE_DOCS) { - //System.out.println(" end"); - return parentDoc = NO_MORE_DOCS; - } - - // Gather all children sharing the same parent as - // nextChildDoc - - parentDoc = parentBits.nextSetBit(nextChildDoc); - - // Parent & child docs are supposed to be - // orthogonal: - checkOrthogonal(nextChildDoc, parentDoc); - - //System.out.println(" parentDoc=" + parentDoc); - assert parentDoc != DocIdSetIterator.NO_MORE_DOCS; - - float totalScore = 0; - float maxScore = Float.NEGATIVE_INFINITY; - float minScore = Float.POSITIVE_INFINITY; - - childDocUpto = 0; - parentFreq = 0; - do { - - //System.out.println(" c=" + nextChildDoc); - if (scoreMode != ScoreMode.None) { - // TODO: specialize this into dedicated classes per-scoreMode - final float childScore = childScorer.score(); - final int childFreq = childScorer.freq(); - maxScore = Math.max(childScore, maxScore); - minScore = Math.min(childScore, minScore); - totalScore += childScore; - parentFreq += childFreq; - } - childDocUpto++; - nextChildDoc = childIt.nextDoc(); - } while (nextChildDoc < parentDoc); - - // Parent & child docs are supposed to be - // orthogonal: - checkOrthogonal(nextChildDoc, parentDoc); - - switch(scoreMode) { - case Avg: - parentScore = totalScore / childDocUpto; - break; - case Max: - parentScore = maxScore; - break; - case Min: - parentScore = minScore; - break; - case Total: - parentScore = totalScore; - break; - case None: - break; - } - - //System.out.println(" return parentDoc=" + parentDoc + " childDocUpto=" + childDocUpto); - return parentDoc; - } - - @Override - public int advance(int parentTarget) throws IOException { - - //System.out.println("Q.advance parentTarget=" + parentTarget); - if (parentTarget == NO_MORE_DOCS) { - return parentDoc = NO_MORE_DOCS; - } - - if (parentTarget == 0) { - // Callers should only be passing in a docID from - // the parent space, so this means this parent - // has no children (it got docID 0), so it cannot - // possibly match. We must handle this case - // separately otherwise we pass invalid -1 to - // prevSetBit below: - return nextDoc(); - } - - prevParentDoc = parentBits.prevSetBit(parentTarget-1); - - //System.out.println(" rolled back to prevParentDoc=" + prevParentDoc + " vs parentDoc=" + parentDoc); - assert prevParentDoc >= parentDoc; - if (prevParentDoc > nextChildDoc) { - nextChildDoc = childIt.advance(prevParentDoc); - // System.out.println(" childScorer advanced to child docID=" + nextChildDoc); - //} else { - //System.out.println(" skip childScorer advance"); - } - - // Parent & child docs are supposed to be orthogonal: - checkOrthogonal(nextChildDoc, prevParentDoc); - - final int nd = nextDoc(); - //System.out.println(" return nextParentDoc=" + nd); - return nd; - } - - @Override - public int docID() { - return parentDoc; - } - - @Override - public long cost() { - return childIt.cost(); - } - }; - } - - private void checkOrthogonal(int childDoc, int parentDoc) { - if (childDoc==parentDoc) { - throw new IllegalStateException("Child query must not match same docs with parent filter. " - + "Combine them as must clauses (+) to find a problem doc. " - + "docId=" + nextChildDoc + ", " + childScorer.getClass()); - + if (parentTwoPhase == null) { + // the approximation is exact + return parentApproximation; + } else { + return TwoPhaseIterator.asDocIdSetIterator(parentTwoPhase); } } + @Override + public TwoPhaseIterator twoPhaseIterator() { + return parentTwoPhase; + } + @Override public int docID() { - return parentDoc; + return parentApproximation.docID(); } @Override public float score() throws IOException { - return parentScore; + setScoreAndFreq(); + return score; } @Override - public int freq() { - return parentFreq; + public int freq() throws IOException { + setScoreAndFreq(); + return freq; + } + + private void setScoreAndFreq() throws IOException { + if (childApproximation.docID() >= parentApproximation.docID()) { + return; + } + double score = scoreMode == ScoreMode.None ? 0 : childScorer.score(); + int freq = 1; + while (childApproximation.nextDoc() < parentApproximation.docID()) { + if (childTwoPhase == null || childTwoPhase.matches()) { + final float childScore = childScorer.score(); + freq += 1; + switch (scoreMode) { + case Total: + case Avg: + score += childScore; + break; + case Min: + score = Math.min(score, childScore); + break; + case Max: + score = Math.min(score, childScore); + break; + case None: + break; + default: + throw new AssertionError(); + } + } + } + if (childApproximation.docID() == parentApproximation.docID() && (childTwoPhase == null || childTwoPhase.matches())) { + throw new IllegalStateException("Child query must not match same docs with parent filter. " + + "Combine them as must clauses (+) to find a problem doc. " + + "docId=" + parentApproximation.docID() + ", " + childScorer.getClass()); + } + if (scoreMode == ScoreMode.Avg) { + score /= freq; + } + this.score = (float) score; + this.freq = freq; } public Explanation explain(LeafReaderContext context, Weight childWeight) throws IOException { + int prevParentDoc = parentBits.prevSetBit(parentApproximation.docID() - 1); int start = context.docBase + prevParentDoc + 1; // +1 b/c prevParentDoc is previous parent doc - int end = context.docBase + parentDoc - 1; // -1 b/c parentDoc is parent doc + int end = context.docBase + parentApproximation.docID() - 1; // -1 b/c parentDoc is parent doc Explanation bestChild = null; int matches = 0; @@ -341,6 +366,7 @@ public class ToParentBlockJoinQuery extends Query { } } + assert freq() == matches; return Explanation.match(score(), String.format(Locale.ROOT, "Score based on %d child docs in range from %d to %d, best match:", matches, start, end), bestChild ); diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java index f508f84e2fd..a13e66fc21a 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java @@ -671,7 +671,7 @@ public class TestBlockJoin extends LuceneTestCase { System.out.println("TEST: iter=" + (1+iter) + " of " + iters); } - final Query childQuery; + Query childQuery; if (random().nextInt(3) == 2) { final int childFieldID = random().nextInt(childFields.length); childQuery = new TermQuery(new Term("child" + childFieldID, @@ -706,6 +706,9 @@ public class TestBlockJoin extends LuceneTestCase { random().nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT); childQuery = bq.build(); } + if (random().nextBoolean()) { + childQuery = new RandomApproximationQuery(childQuery, random()); + } final ScoreMode agg = ScoreMode.values()[random().nextInt(ScoreMode.values().length)]; diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java index aa68d09184b..cb3762c0fa7 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java +++ b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoinValidation.java @@ -87,25 +87,6 @@ public class TestBlockJoinValidation extends LuceneTestCase { assertTrue(expected.getMessage() != null && expected.getMessage().contains("Child query must not match same docs with parent filter")); } - public void testAdvanceValidationForToParentBjq() throws Exception { - int randomChildNumber = getRandomChildNumber(0); - // we need to make advance method meet wrong document, so random child number - // in BJQ must be greater than child number in Boolean clause - int nextRandomChildNumber = getRandomChildNumber(randomChildNumber); - Query parentQueryWithRandomChild = createChildrenQueryWithOneParent(nextRandomChildNumber); - ToParentBlockJoinQuery blockJoinQuery = new ToParentBlockJoinQuery(parentQueryWithRandomChild, parentsFilter, ScoreMode.None); - // advance() method is used by ConjunctionScorer, so we need to create Boolean conjunction query - BooleanQuery.Builder conjunctionQuery = new BooleanQuery.Builder(); - WildcardQuery childQuery = new WildcardQuery(new Term("child", createFieldValue(randomChildNumber))); - conjunctionQuery.add(new BooleanClause(childQuery, BooleanClause.Occur.MUST)); - conjunctionQuery.add(new BooleanClause(blockJoinQuery, BooleanClause.Occur.MUST)); - - IllegalStateException expected = expectThrows(IllegalStateException.class, () -> { - indexSearcher.search(conjunctionQuery.build(), 1); - }); - assertTrue(expected.getMessage() != null && expected.getMessage().contains("Child query must not match same docs with parent filter")); - } - public void testNextDocValidationForToChildBjq() throws Exception { Query parentQueryWithRandomChild = createParentsQueryWithOneChild(getRandomChildNumber(0));