LUCENE-7654: To-parent block joins should implement two-phase iteration.

This commit is contained in:
Adrien Grand 2017-01-30 10:46:22 +01:00
parent 076662d1b2
commit 9d5dc0cf57
4 changed files with 190 additions and 176 deletions

View File

@ -135,6 +135,10 @@ Optimizations
whether BKD cells are entirely within the distance close to the dateline.
(Adrien Grand)
* LUCENE-7654: ToParentBlockJoinQuery now implements two-phase iteration and
computes scores lazily in order to be faster when used in conjunctions.
(Adrien Grand)
Build
* LUCENE-7651: Fix Javadocs build for Java 8u121 by injecting "Google Code

View File

@ -20,15 +20,18 @@ import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Locale;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.FilterWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.FilterWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BitSet;
@ -116,33 +119,44 @@ public class ToParentBlockJoinQuery extends Query {
this.scoreMode = scoreMode;
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
final ScorerSupplier scorerSupplier = scorerSupplier(context);
if (scorerSupplier == null) {
return null;
}
return scorerSupplier.get(false);
}
// NOTE: acceptDocs applies (and is checked) only in the
// parent document space
@Override
public Scorer scorer(LeafReaderContext readerContext) throws IOException {
final Scorer childScorer = in.scorer(readerContext);
if (childScorer == null) {
// No matches
return null;
}
final int firstChildDoc = childScorer.iterator().nextDoc();
if (firstChildDoc == DocIdSetIterator.NO_MORE_DOCS) {
// No matches
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
final ScorerSupplier childScorerSupplier = in.scorerSupplier(context);
if (childScorerSupplier == null) {
return null;
}
// NOTE: this does not take accept docs into account, the responsibility
// to not match deleted docs is on the scorer
final BitSet parents = parentsFilter.getBitSet(readerContext);
final BitSet parents = parentsFilter.getBitSet(context);
if (parents == null) {
// No matches
return null;
}
return new BlockJoinScorer(this, childScorer, parents, firstChildDoc, scoreMode);
return new ScorerSupplier() {
@Override
public Scorer get(boolean randomAccess) throws IOException {
return new BlockJoinScorer(BlockJoinWeight.this, childScorerSupplier.get(randomAccess), parents, scoreMode);
}
@Override
public long cost() {
return childScorerSupplier.cost();
}
};
}
@Override
@ -155,24 +169,107 @@ public class ToParentBlockJoinQuery extends Query {
}
}
private static class ParentApproximation extends DocIdSetIterator {
private final DocIdSetIterator childApproximation;
private final BitSet parentBits;
private int doc = -1;
ParentApproximation(DocIdSetIterator childApproximation, BitSet parentBits) {
this.childApproximation = childApproximation;
this.parentBits = parentBits;
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
if (target >= parentBits.length()) {
return doc = NO_MORE_DOCS;
}
final int firstChildTarget = target == 0 ? 0 : parentBits.prevSetBit(target - 1) + 1;
int childDoc = childApproximation.docID();
if (childDoc < firstChildTarget) {
childDoc = childApproximation.advance(firstChildTarget);
}
if (childDoc >= parentBits.length() - 1) {
return doc = NO_MORE_DOCS;
}
return doc = parentBits.nextSetBit(childDoc + 1);
}
@Override
public long cost() {
return childApproximation.cost();
}
}
private static class ParentTwoPhase extends TwoPhaseIterator {
private final ParentApproximation parentApproximation;
private final DocIdSetIterator childApproximation;
private final TwoPhaseIterator childTwoPhase;
ParentTwoPhase(ParentApproximation parentApproximation, TwoPhaseIterator childTwoPhase) {
super(parentApproximation);
this.parentApproximation = parentApproximation;
this.childApproximation = childTwoPhase.approximation();
this.childTwoPhase = childTwoPhase;
}
@Override
public boolean matches() throws IOException {
assert childApproximation.docID() < parentApproximation.docID();
do {
if (childTwoPhase.matches()) {
return true;
}
} while (childApproximation.nextDoc() < parentApproximation.docID());
return false;
}
@Override
public float matchCost() {
// TODO: how could we compute a match cost?
return childTwoPhase.matchCost() + 10;
}
}
static class BlockJoinScorer extends Scorer {
private final Scorer childScorer;
private final BitSet parentBits;
private final ScoreMode scoreMode;
private int parentDoc = -1;
private int prevParentDoc;
private float parentScore;
private int parentFreq;
private int nextChildDoc;
private int childDocUpto;
private final DocIdSetIterator childApproximation;
private final TwoPhaseIterator childTwoPhase;
private final ParentApproximation parentApproximation;
private final ParentTwoPhase parentTwoPhase;
private float score;
private int freq;
public BlockJoinScorer(Weight weight, Scorer childScorer, BitSet parentBits, int firstChildDoc, ScoreMode scoreMode) {
public BlockJoinScorer(Weight weight, Scorer childScorer, BitSet parentBits, ScoreMode scoreMode) {
super(weight);
//System.out.println("Q.init firstChildDoc=" + firstChildDoc);
this.parentBits = parentBits;
this.childScorer = childScorer;
this.scoreMode = scoreMode;
nextChildDoc = firstChildDoc;
childTwoPhase = childScorer.twoPhaseIterator();
if (childTwoPhase == null) {
childApproximation = childScorer.iterator();
parentApproximation = new ParentApproximation(childApproximation, parentBits);
parentTwoPhase = null;
} else {
childApproximation = childTwoPhase.approximation();
parentApproximation = new ParentApproximation(childTwoPhase.approximation(), parentBits);
parentTwoPhase = new ParentTwoPhase(parentApproximation, childTwoPhase);
}
}
@Override
@ -182,152 +279,80 @@ public class ToParentBlockJoinQuery extends Query {
@Override
public DocIdSetIterator iterator() {
return new DocIdSetIterator() {
final DocIdSetIterator childIt = childScorer.iterator();
@Override
public int nextDoc() throws IOException {
//System.out.println("Q.nextDoc() nextChildDoc=" + nextChildDoc);
if (nextChildDoc == NO_MORE_DOCS) {
//System.out.println(" end");
return parentDoc = NO_MORE_DOCS;
if (parentTwoPhase == null) {
// the approximation is exact
return parentApproximation;
} else {
return TwoPhaseIterator.asDocIdSetIterator(parentTwoPhase);
}
// Gather all children sharing the same parent as
// nextChildDoc
parentDoc = parentBits.nextSetBit(nextChildDoc);
// Parent & child docs are supposed to be
// orthogonal:
checkOrthogonal(nextChildDoc, parentDoc);
//System.out.println(" parentDoc=" + parentDoc);
assert parentDoc != DocIdSetIterator.NO_MORE_DOCS;
float totalScore = 0;
float maxScore = Float.NEGATIVE_INFINITY;
float minScore = Float.POSITIVE_INFINITY;
childDocUpto = 0;
parentFreq = 0;
do {
//System.out.println(" c=" + nextChildDoc);
if (scoreMode != ScoreMode.None) {
// TODO: specialize this into dedicated classes per-scoreMode
final float childScore = childScorer.score();
final int childFreq = childScorer.freq();
maxScore = Math.max(childScore, maxScore);
minScore = Math.min(childScore, minScore);
totalScore += childScore;
parentFreq += childFreq;
}
childDocUpto++;
nextChildDoc = childIt.nextDoc();
} while (nextChildDoc < parentDoc);
// Parent & child docs are supposed to be
// orthogonal:
checkOrthogonal(nextChildDoc, parentDoc);
switch(scoreMode) {
case Avg:
parentScore = totalScore / childDocUpto;
break;
case Max:
parentScore = maxScore;
break;
case Min:
parentScore = minScore;
break;
case Total:
parentScore = totalScore;
break;
case None:
break;
}
//System.out.println(" return parentDoc=" + parentDoc + " childDocUpto=" + childDocUpto);
return parentDoc;
}
@Override
public int advance(int parentTarget) throws IOException {
//System.out.println("Q.advance parentTarget=" + parentTarget);
if (parentTarget == NO_MORE_DOCS) {
return parentDoc = NO_MORE_DOCS;
}
if (parentTarget == 0) {
// Callers should only be passing in a docID from
// the parent space, so this means this parent
// has no children (it got docID 0), so it cannot
// possibly match. We must handle this case
// separately otherwise we pass invalid -1 to
// prevSetBit below:
return nextDoc();
}
prevParentDoc = parentBits.prevSetBit(parentTarget-1);
//System.out.println(" rolled back to prevParentDoc=" + prevParentDoc + " vs parentDoc=" + parentDoc);
assert prevParentDoc >= parentDoc;
if (prevParentDoc > nextChildDoc) {
nextChildDoc = childIt.advance(prevParentDoc);
// System.out.println(" childScorer advanced to child docID=" + nextChildDoc);
//} else {
//System.out.println(" skip childScorer advance");
}
// Parent & child docs are supposed to be orthogonal:
checkOrthogonal(nextChildDoc, prevParentDoc);
final int nd = nextDoc();
//System.out.println(" return nextParentDoc=" + nd);
return nd;
public TwoPhaseIterator twoPhaseIterator() {
return parentTwoPhase;
}
@Override
public int docID() {
return parentDoc;
}
@Override
public long cost() {
return childIt.cost();
}
};
}
private void checkOrthogonal(int childDoc, int parentDoc) {
if (childDoc==parentDoc) {
throw new IllegalStateException("Child query must not match same docs with parent filter. "
+ "Combine them as must clauses (+) to find a problem doc. "
+ "docId=" + nextChildDoc + ", " + childScorer.getClass());
}
}
@Override
public int docID() {
return parentDoc;
return parentApproximation.docID();
}
@Override
public float score() throws IOException {
return parentScore;
setScoreAndFreq();
return score;
}
@Override
public int freq() {
return parentFreq;
public int freq() throws IOException {
setScoreAndFreq();
return freq;
}
private void setScoreAndFreq() throws IOException {
if (childApproximation.docID() >= parentApproximation.docID()) {
return;
}
double score = scoreMode == ScoreMode.None ? 0 : childScorer.score();
int freq = 1;
while (childApproximation.nextDoc() < parentApproximation.docID()) {
if (childTwoPhase == null || childTwoPhase.matches()) {
final float childScore = childScorer.score();
freq += 1;
switch (scoreMode) {
case Total:
case Avg:
score += childScore;
break;
case Min:
score = Math.min(score, childScore);
break;
case Max:
score = Math.min(score, childScore);
break;
case None:
break;
default:
throw new AssertionError();
}
}
}
if (childApproximation.docID() == parentApproximation.docID() && (childTwoPhase == null || childTwoPhase.matches())) {
throw new IllegalStateException("Child query must not match same docs with parent filter. "
+ "Combine them as must clauses (+) to find a problem doc. "
+ "docId=" + parentApproximation.docID() + ", " + childScorer.getClass());
}
if (scoreMode == ScoreMode.Avg) {
score /= freq;
}
this.score = (float) score;
this.freq = freq;
}
public Explanation explain(LeafReaderContext context, Weight childWeight) throws IOException {
int prevParentDoc = parentBits.prevSetBit(parentApproximation.docID() - 1);
int start = context.docBase + prevParentDoc + 1; // +1 b/c prevParentDoc is previous parent doc
int end = context.docBase + parentDoc - 1; // -1 b/c parentDoc is parent doc
int end = context.docBase + parentApproximation.docID() - 1; // -1 b/c parentDoc is parent doc
Explanation bestChild = null;
int matches = 0;
@ -341,6 +366,7 @@ public class ToParentBlockJoinQuery extends Query {
}
}
assert freq() == matches;
return Explanation.match(score(), String.format(Locale.ROOT,
"Score based on %d child docs in range from %d to %d, best match:", matches, start, end), bestChild
);

View File

@ -671,7 +671,7 @@ public class TestBlockJoin extends LuceneTestCase {
System.out.println("TEST: iter=" + (1+iter) + " of " + iters);
}
final Query childQuery;
Query childQuery;
if (random().nextInt(3) == 2) {
final int childFieldID = random().nextInt(childFields.length);
childQuery = new TermQuery(new Term("child" + childFieldID,
@ -706,6 +706,9 @@ public class TestBlockJoin extends LuceneTestCase {
random().nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.MUST_NOT);
childQuery = bq.build();
}
if (random().nextBoolean()) {
childQuery = new RandomApproximationQuery(childQuery, random());
}
final ScoreMode agg = ScoreMode.values()[random().nextInt(ScoreMode.values().length)];

View File

@ -87,25 +87,6 @@ public class TestBlockJoinValidation extends LuceneTestCase {
assertTrue(expected.getMessage() != null && expected.getMessage().contains("Child query must not match same docs with parent filter"));
}
public void testAdvanceValidationForToParentBjq() throws Exception {
int randomChildNumber = getRandomChildNumber(0);
// we need to make advance method meet wrong document, so random child number
// in BJQ must be greater than child number in Boolean clause
int nextRandomChildNumber = getRandomChildNumber(randomChildNumber);
Query parentQueryWithRandomChild = createChildrenQueryWithOneParent(nextRandomChildNumber);
ToParentBlockJoinQuery blockJoinQuery = new ToParentBlockJoinQuery(parentQueryWithRandomChild, parentsFilter, ScoreMode.None);
// advance() method is used by ConjunctionScorer, so we need to create Boolean conjunction query
BooleanQuery.Builder conjunctionQuery = new BooleanQuery.Builder();
WildcardQuery childQuery = new WildcardQuery(new Term("child", createFieldValue(randomChildNumber)));
conjunctionQuery.add(new BooleanClause(childQuery, BooleanClause.Occur.MUST));
conjunctionQuery.add(new BooleanClause(blockJoinQuery, BooleanClause.Occur.MUST));
IllegalStateException expected = expectThrows(IllegalStateException.class, () -> {
indexSearcher.search(conjunctionQuery.build(), 1);
});
assertTrue(expected.getMessage() != null && expected.getMessage().contains("Child query must not match same docs with parent filter"));
}
public void testNextDocValidationForToChildBjq() throws Exception {
Query parentQueryWithRandomChild = createParentsQueryWithOneChild(getRandomChildNumber(0));