From 1ea8419336a6806463fd9dc83dc77e41a5475649 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 24 Jul 2019 15:12:57 +0200 Subject: [PATCH] LUCENE-8922: Better impacts for DisjunctionMaxQuery. (#791) --- lucene/CHANGES.txt | 5 + .../lucene/search/DisjunctionMaxScorer.java | 28 ++-- ...sjunctionScoreBlockBoundaryPropagator.java | 112 ++++++++++++++++ ...sjunctionScoreBlockBoundaryPropagator.java | 121 ++++++++++++++++++ 4 files changed, 254 insertions(+), 12 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestDisjunctionScoreBlockBoundaryPropagator.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1c80a5e8680..6c1e2a21016 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -71,6 +71,11 @@ Improvements * LUCENE-8916: GraphTokenStreamFiniteStrings preserves all Token attributes through its finite strings TokenStreams (Alan Woodward) +Optimizations + +* LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip + non-competitive hits. (Adrien Grand) + Other * LUCENE-8778 LUCENE-8911: Define analyzer SPI names as static final fields and document the names in Javadocs. diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java index 82dbda096aa..63d0285ca95 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java @@ -21,8 +21,6 @@ import java.util.List; import org.apache.lucene.util.MathUtil; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - /** * The Scorer for DisjunctionMaxQuery. The union of all documents generated by the subquery scorers * is generated in document number order. The score for each document is the maximum of the scores computed @@ -34,6 +32,8 @@ final class DisjunctionMaxScorer extends DisjunctionScorer { /* Multiplier applied to non-maximum-scoring subqueries for a document as they are summed into the result. */ private final float tieBreakerMultiplier; + private final DisjunctionScoreBlockBoundaryPropagator disjunctionBlockPropagator; + /** * Creates a new instance of DisjunctionMaxScorer * @@ -52,6 +52,11 @@ final class DisjunctionMaxScorer extends DisjunctionScorer { if (tieBreakerMultiplier < 0 || tieBreakerMultiplier > 1) { throw new IllegalArgumentException("tieBreakerMultiplier must be in [0, 1]"); } + if (scoreMode == ScoreMode.TOP_SCORES) { + this.disjunctionBlockPropagator = new DisjunctionScoreBlockBoundaryPropagator(subScorers); + } else { + this.disjunctionBlockPropagator = null; + } } @Override @@ -72,15 +77,7 @@ final class DisjunctionMaxScorer extends DisjunctionScorer { @Override public int advanceShallow(int target) throws IOException { - int upTo = NO_MORE_DOCS; - for (Scorer scorer : subScorers) { - if (scorer.docID() <= target) { - upTo = Math.min(scorer.advanceShallow(target), upTo); - } else if (scorer.docID() < NO_MORE_DOCS) { - upTo = Math.min(scorer.docID()-1, upTo); - } - } - return upTo; + return disjunctionBlockPropagator.advanceShallow(target); } @Override @@ -112,7 +109,14 @@ final class DisjunctionMaxScorer extends DisjunctionScorer { } @Override - public void setMinCompetitiveScore(float minScore) { + public void setMinCompetitiveScore(float minScore) throws IOException { getBlockMaxApprox().setMinCompetitiveScore(minScore); + disjunctionBlockPropagator.setMinCompetitiveScore(minScore); + if (tieBreakerMultiplier == 0) { + // TODO: we could even remove some scorers from the priority queue? + for (Scorer scorer : subScorers) { + scorer.setMinCompetitiveScore(minScore); + } + } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java new file mode 100644 index 00000000000..a8fd206bb31 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; + +/** + * A helper to propagate block boundaries for disjunctions. + * Because a disjunction matches if any of its sub clauses matches, it is + * tempting to return the minimum block boundary across all clauses. The problem + * is that it might then make the query slow when the minimum competitive score + * is high and low-scoring clauses don't drive iteration anymore. So this class + * computes block boundaries only across clauses whose maximum score is greater + * than or equal to the minimum competitive score, or the maximum scoring clause + * if there is no such clause. + */ +final class DisjunctionScoreBlockBoundaryPropagator { + + private static final Comparator MAX_SCORE_COMPARATOR = Comparator.comparing((Scorer s) -> { + try { + return s.getMaxScore(DocIdSetIterator.NO_MORE_DOCS); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).thenComparing(Comparator.comparing(s -> s.iterator().cost())); + + private final Scorer[] scorers; + private final float[] maxScores; + private int leadIndex = 0; + + DisjunctionScoreBlockBoundaryPropagator(Collection scorers) throws IOException { + this.scorers = scorers.toArray(Scorer[]::new); + for (Scorer scorer : this.scorers) { + scorer.advanceShallow(0); + } + Arrays.sort(this.scorers, MAX_SCORE_COMPARATOR); + + maxScores = new float[this.scorers.length]; + for (int i = 0; i < this.scorers.length; ++i) { + maxScores[i] = this.scorers[i].getMaxScore(DocIdSetIterator.NO_MORE_DOCS); + } + } + + /** + * See {@link Scorer#advanceShallow(int)}. + */ + int advanceShallow(int target) throws IOException { + // For scorers that are below the lead index, just propagate. + for (int i = 0; i < leadIndex; ++i) { + Scorer s = scorers[i]; + if (s.docID() < target) { + s.advanceShallow(target); + } + } + + // For scorers above the lead index, we take the minimum + // boundary. + Scorer leadScorer = scorers[leadIndex]; + int upTo = leadScorer.advanceShallow(Math.max(leadScorer.docID(), target)); + + for (int i = leadIndex + 1; i < scorers.length; ++i) { + Scorer scorer = scorers[i]; + if (scorer.docID() <= target) { + upTo = Math.min(scorer.advanceShallow(target), upTo); + } + } + + // If the maximum scoring clauses are beyond `target`, then we use their + // docID as a boundary. It helps not consider them when computing the + // maximum score and get a lower score upper bound. + for (int i = scorers.length - 1; i > leadIndex; --i) { + Scorer scorer = scorers[i]; + if (scorer.docID() > target) { + upTo = Math.min(upTo, scorer.docID() - 1); + } else { + break; + } + } + + return upTo; + } + + /** + * Set the minimum competitive score to filter out clauses that score less + * than this threshold. + * @see Scorer#setMinCompetitiveScore + */ + void setMinCompetitiveScore(float minScore) throws IOException { + // Update the lead index if necessary + while (leadIndex < maxScores.length - 1 && minScore > maxScores[leadIndex]) { + leadIndex++; + } + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionScoreBlockBoundaryPropagator.java b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionScoreBlockBoundaryPropagator.java new file mode 100644 index 00000000000..423e002eaef --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionScoreBlockBoundaryPropagator.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.util.LuceneTestCase; + +public class TestDisjunctionScoreBlockBoundaryPropagator extends LuceneTestCase { + + private static class FakeWeight extends Weight { + + FakeWeight() { + super(new MatchNoDocsQuery()); + } + + @Override + public Explanation explain(LeafReaderContext context, int doc) throws IOException { + return null; + } + + @Override + public Scorer scorer(LeafReaderContext context) throws IOException { + return null; + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return false; + } + } + + private static class FakeScorer extends Scorer { + + final int boundary; + final float maxScore; + + FakeScorer(int boundary, float maxScore) throws IOException { + super(new FakeWeight()); + this.boundary = boundary; + this.maxScore = maxScore; + } + + @Override + public int docID() { + return 0; + } + + @Override + public float score() { + throw new UnsupportedOperationException(); + } + + @Override + public DocIdSetIterator iterator() { + throw new UnsupportedOperationException(); + } + + @Override + public void setMinCompetitiveScore(float minCompetitiveScore) {} + + @Override + public float getMaxScore(int upTo) throws IOException { + return maxScore; + } + + @Override + public int advanceShallow(int target) { + assert target <= boundary; + return boundary; + } + } + + public void testBasics() throws IOException { + Scorer scorer1 = new FakeScorer(20, 0.5f); + Scorer scorer2 = new FakeScorer(50, 1.5f); + Scorer scorer3 = new FakeScorer(30, 2f); + Scorer scorer4 = new FakeScorer(80, 3f); + List scorers = Arrays.asList(scorer1, scorer2, scorer3, scorer4); + Collections.shuffle(scorers, random()); + DisjunctionScoreBlockBoundaryPropagator propagator = new DisjunctionScoreBlockBoundaryPropagator(scorers); + assertEquals(20, propagator.advanceShallow(0)); + + propagator.setMinCompetitiveScore(0.2f); + assertEquals(20, propagator.advanceShallow(0)); + + propagator.setMinCompetitiveScore(0.7f); + assertEquals(30, propagator.advanceShallow(0)); + + propagator.setMinCompetitiveScore(1.2f); + assertEquals(30, propagator.advanceShallow(0)); + + propagator.setMinCompetitiveScore(1.7f); + assertEquals(30, propagator.advanceShallow(0)); + + propagator.setMinCompetitiveScore(2.2f); + assertEquals(80, propagator.advanceShallow(0)); + + propagator.setMinCompetitiveScore(5f); + assertEquals(80, propagator.advanceShallow(0)); + } + +}