LUCENE-8922: Better impacts for DisjunctionMaxQuery. (#791)

This commit is contained in:
Adrien Grand 2019-07-24 15:12:57 +02:00 committed by GitHub
parent 85814e262c
commit 1ea8419336
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 254 additions and 12 deletions

View File

@ -71,6 +71,11 @@ Improvements
* LUCENE-8916: GraphTokenStreamFiniteStrings preserves all Token attributes
through its finite strings TokenStreams (Alan Woodward)
Optimizations
* LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip
non-competitive hits. (Adrien Grand)
Other
* LUCENE-8778 LUCENE-8911: Define analyzer SPI names as static final fields and document the names in Javadocs.

View File

@ -21,8 +21,6 @@ import java.util.List;
import org.apache.lucene.util.MathUtil;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
/**
* The Scorer for DisjunctionMaxQuery. The union of all documents generated by the subquery scorers
* is generated in document number order. The score for each document is the maximum of the scores computed
@ -34,6 +32,8 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
/* Multiplier applied to non-maximum-scoring subqueries for a document as they are summed into the result. */
private final float tieBreakerMultiplier;
private final DisjunctionScoreBlockBoundaryPropagator disjunctionBlockPropagator;
/**
* Creates a new instance of DisjunctionMaxScorer
*
@ -52,6 +52,11 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
if (tieBreakerMultiplier < 0 || tieBreakerMultiplier > 1) {
throw new IllegalArgumentException("tieBreakerMultiplier must be in [0, 1]");
}
if (scoreMode == ScoreMode.TOP_SCORES) {
this.disjunctionBlockPropagator = new DisjunctionScoreBlockBoundaryPropagator(subScorers);
} else {
this.disjunctionBlockPropagator = null;
}
}
@Override
@ -72,15 +77,7 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
@Override
public int advanceShallow(int target) throws IOException {
int upTo = NO_MORE_DOCS;
for (Scorer scorer : subScorers) {
if (scorer.docID() <= target) {
upTo = Math.min(scorer.advanceShallow(target), upTo);
} else if (scorer.docID() < NO_MORE_DOCS) {
upTo = Math.min(scorer.docID()-1, upTo);
}
}
return upTo;
return disjunctionBlockPropagator.advanceShallow(target);
}
@Override
@ -112,7 +109,14 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
}
@Override
public void setMinCompetitiveScore(float minScore) {
public void setMinCompetitiveScore(float minScore) throws IOException {
getBlockMaxApprox().setMinCompetitiveScore(minScore);
disjunctionBlockPropagator.setMinCompetitiveScore(minScore);
if (tieBreakerMultiplier == 0) {
// TODO: we could even remove some scorers from the priority queue?
for (Scorer scorer : subScorers) {
scorer.setMinCompetitiveScore(minScore);
}
}
}
}

View File

@ -0,0 +1,112 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
/**
 * Propagates block boundaries for a disjunction of scorers.
 *
 * <p>A disjunction matches as soon as any clause matches, so the obvious
 * boundary would be the minimum boundary across all clauses. That choice can
 * make the query slow once the minimum competitive score is high and
 * low-scoring clauses no longer drive iteration. This helper therefore only
 * takes the minimum across clauses whose maximum score is greater than or
 * equal to the minimum competitive score, falling back to the single
 * maximum-scoring clause when no clause qualifies.
 */
final class DisjunctionScoreBlockBoundaryPropagator {

  /**
   * Orders scorers by ascending global maximum score, breaking ties by
   * ascending iterator cost.
   */
  private static final Comparator<Scorer> MAX_SCORE_COMPARATOR =
      Comparator.comparing(DisjunctionScoreBlockBoundaryPropagator::globalMaxScore)
          .thenComparingLong(s -> s.iterator().cost());

  /** Sub scorers, sorted with {@link #MAX_SCORE_COMPARATOR}. */
  private final Scorer[] scorers;

  /** {@code maxScores[i]} is the global maximum score of {@code scorers[i]}. */
  private final float[] maxScores;

  /**
   * Index of the lowest-scoring clause that still takes part in the boundary
   * computation; it only ever moves forward.
   */
  private int leadIndex = 0;

  /**
   * Returns the maximum score of {@code scorer} up to {@code NO_MORE_DOCS}.
   * A comparator key extractor cannot throw a checked exception, so an
   * {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
   */
  private static Float globalMaxScore(Scorer scorer) {
    try {
      return scorer.getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Builds a propagator over the given sub scorers.
   *
   * @param scorers the clauses of the disjunction; they are copied into an
   *                internal array, the collection itself is not modified
   * @throws IOException if a scorer fails while being shallow-advanced or
   *                     while reporting its maximum score
   */
  DisjunctionScoreBlockBoundaryPropagator(Collection<Scorer> scorers) throws IOException {
    final Scorer[] sorted = scorers.toArray(Scorer[]::new);

    // Shallow-advance every clause to doc 0 before asking for max scores.
    // NOTE(review): presumably required so getMaxScore may legally be called - confirm.
    for (Scorer clause : sorted) {
      clause.advanceShallow(0);
    }
    Arrays.sort(sorted, MAX_SCORE_COMPARATOR);

    // Cache the max scores in sorted order so that setMinCompetitiveScore
    // does not have to query the scorers again.
    final float[] cached = new float[sorted.length];
    int slot = 0;
    for (Scorer clause : sorted) {
      cached[slot++] = clause.getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
    }

    this.scorers = sorted;
    this.maxScores = cached;
  }

  /**
   * See {@link Scorer#advanceShallow(int)}.
   *
   * @param target the doc ID to shallow-advance to
   * @return the block boundary computed from the lead clause and the clauses
   *         sorted after it
   * @throws IOException if a sub scorer fails
   */
  int advanceShallow(int target) throws IOException {
    final int count = scorers.length;

    // Clauses sorted before the lead do not contribute to the boundary: they
    // are merely kept in sync with the target.
    for (int i = 0; i < leadIndex; ++i) {
      final Scorer lagging = scorers[i];
      if (lagging.docID() < target) {
        lagging.advanceShallow(target);
      }
    }

    // The lead clause always contributes, even if it is already past target.
    final Scorer lead = scorers[leadIndex];
    int boundary = lead.advanceShallow(Math.max(lead.docID(), target));

    // Clauses sorted after the lead contribute their boundary as long as they
    // are not positioned beyond target.
    for (int i = leadIndex + 1; i < count; ++i) {
      final Scorer clause = scorers[i];
      if (clause.docID() > target) {
        continue;
      }
      boundary = Math.min(boundary, clause.advanceShallow(target));
    }

    // Walk down from the highest-scoring clause: while those clauses are
    // positioned beyond target, stop the block right before their current
    // doc so that they can be ignored when computing the maximum score,
    // which yields a lower score upper bound.
    int i = count - 1;
    while (i > leadIndex && scorers[i].docID() > target) {
      boundary = Math.min(boundary, scorers[i].docID() - 1);
      --i;
    }
    return boundary;
  }

  /**
   * Set the minimum competitive score, which moves the lead index past every
   * clause whose maximum score is below this threshold. The last clause is
   * never skipped.
   *
   * @param minScore the new minimum competitive score
   * @see Scorer#setMinCompetitiveScore
   */
  void setMinCompetitiveScore(float minScore) throws IOException {
    final int lastIndex = maxScores.length - 1;
    while (leadIndex < lastIndex && maxScores[leadIndex] < minScore) {
      leadIndex++;
    }
  }
}

View File

@ -0,0 +1,121 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.LuceneTestCase;
/** Unit test for {@link DisjunctionScoreBlockBoundaryPropagator}. */
public class TestDisjunctionScoreBlockBoundaryPropagator extends LuceneTestCase {

  /** Placeholder weight whose only purpose is to satisfy the {@link Scorer} constructor. */
  private static class FakeWeight extends Weight {

    FakeWeight() {
      super(new MatchNoDocsQuery());
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      return null;
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      return null;
    }

    @Override
    public boolean isCacheable(LeafReaderContext ctx) {
      return false;
    }
  }

  /**
   * Scorer stub that always sits on doc 0 and reports a fixed block boundary
   * and a fixed maximum score.
   */
  private static class FakeScorer extends Scorer {

    /** Value returned by every call to {@link #advanceShallow(int)}. */
    final int blockEnd;

    /** Value returned by every call to {@link #getMaxScore(int)}. */
    final float scoreBound;

    FakeScorer(int boundary, float maxScore) throws IOException {
      super(new FakeWeight());
      this.blockEnd = boundary;
      this.scoreBound = maxScore;
    }

    @Override
    public int docID() {
      return 0;
    }

    @Override
    public DocIdSetIterator iterator() {
      // Not expected to be called by this test.
      throw new UnsupportedOperationException();
    }

    @Override
    public float score() {
      // Not expected to be called by this test.
      throw new UnsupportedOperationException();
    }

    @Override
    public int advanceShallow(int target) {
      assert target <= blockEnd;
      return blockEnd;
    }

    @Override
    public float getMaxScore(int upTo) throws IOException {
      return scoreBound;
    }

    @Override
    public void setMinCompetitiveScore(float minCompetitiveScore) {}
  }

  /**
   * Raises the minimum competitive score to {@code minScore} and checks the
   * boundary then reported for target doc 0.
   */
  private static void assertBoundaryAfter(
      DisjunctionScoreBlockBoundaryPropagator propagator, float minScore, int expectedBoundary)
      throws IOException {
    propagator.setMinCompetitiveScore(minScore);
    assertEquals(expectedBoundary, propagator.advanceShallow(0));
  }

  /**
   * Checks that the reported boundary only takes into account clauses whose
   * max score is still competitive as the minimum competitive score grows.
   */
  public void testBasics() throws IOException {
    // (boundary, max score) pairs; the input order is randomized because the
    // propagator sorts the clauses itself.
    List<Scorer> clauses = Arrays.asList(
        new FakeScorer(20, 0.5f),
        new FakeScorer(50, 1.5f),
        new FakeScorer(30, 2f),
        new FakeScorer(80, 3f));
    Collections.shuffle(clauses, random());

    DisjunctionScoreBlockBoundaryPropagator propagator =
        new DisjunctionScoreBlockBoundaryPropagator(clauses);

    // No threshold yet: every clause participates, the minimum is 20.
    assertEquals(20, propagator.advanceShallow(0));

    // Below the lowest max score (0.5): nothing changes.
    assertBoundaryAfter(propagator, 0.2f, 20);

    // Above 0.5: the first clause drops out, min(50, 30, 80) = 30.
    assertBoundaryAfter(propagator, 0.7f, 30);
    assertBoundaryAfter(propagator, 1.2f, 30);

    // Above 1.5: the second clause drops out, min(30, 80) = 30.
    assertBoundaryAfter(propagator, 1.7f, 30);

    // Above 2: only the top-scoring clause remains.
    assertBoundaryAfter(propagator, 2.2f, 80);

    // Above every max score: the top-scoring clause is still used.
    assertBoundaryAfter(propagator, 5f, 80);
  }
}