From 1ea8419336a6806463fd9dc83dc77e41a5475649 Mon Sep 17 00:00:00 2001
From: Adrien Grand <jpountz@gmail.com>
Date: Wed, 24 Jul 2019 15:12:57 +0200
Subject: [PATCH] LUCENE-8922: Better impacts for DisjunctionMaxQuery. (#791)

---
 lucene/CHANGES.txt                            |   5 +
 .../lucene/search/DisjunctionMaxScorer.java   |  28 ++--
 ...sjunctionScoreBlockBoundaryPropagator.java | 112 ++++++++++++++++
 ...sjunctionScoreBlockBoundaryPropagator.java | 121 ++++++++++++++++++
 4 files changed, 254 insertions(+), 12 deletions(-)
 create mode 100644 lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java
 create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestDisjunctionScoreBlockBoundaryPropagator.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 1c80a5e8680..6c1e2a21016 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -71,6 +71,11 @@ Improvements
 * LUCENE-8916: GraphTokenStreamFiniteStrings preserves all Token attributes
   through its finite strings TokenStreams (Alan Woodward)
 
+Optimizations
+
+* LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip
+  non-competitive hits. (Adrien Grand)
+
 Other
 
 * LUCENE-8778 LUCENE-8911: Define analyzer SPI names as static final fields and document the names in Javadocs.
diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java
index 82dbda096aa..63d0285ca95 100644
--- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java
@@ -21,8 +21,6 @@ import java.util.List;
 
 import org.apache.lucene.util.MathUtil;
 
-import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
-
 /**
  * The Scorer for DisjunctionMaxQuery.  The union of all documents generated by the subquery scorers
  * is generated in document number order.  The score for each document is the maximum of the scores computed
@@ -34,6 +32,8 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
   /* Multiplier applied to non-maximum-scoring subqueries for a document as they are summed into the result. */
   private final float tieBreakerMultiplier;
 
+  private final DisjunctionScoreBlockBoundaryPropagator disjunctionBlockPropagator;
+
   /**
    * Creates a new instance of DisjunctionMaxScorer
    * 
@@ -52,6 +52,11 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
     if (tieBreakerMultiplier < 0 || tieBreakerMultiplier > 1) {
       throw new IllegalArgumentException("tieBreakerMultiplier must be in [0, 1]");
     }
+    if (scoreMode == ScoreMode.TOP_SCORES) {
+      this.disjunctionBlockPropagator = new DisjunctionScoreBlockBoundaryPropagator(subScorers);
+    } else {
+      this.disjunctionBlockPropagator = null;
+    }
   }
 
   @Override
@@ -72,15 +77,7 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
 
   @Override
   public int advanceShallow(int target) throws IOException {
-    int upTo = NO_MORE_DOCS;
-    for (Scorer scorer : subScorers) {
-      if (scorer.docID() <= target) {
-        upTo = Math.min(scorer.advanceShallow(target), upTo);
-      } else if (scorer.docID() < NO_MORE_DOCS) {
-        upTo = Math.min(scorer.docID()-1, upTo);
-      }
-    }
-    return upTo;
+    return disjunctionBlockPropagator.advanceShallow(target);
   }
 
   @Override
@@ -112,7 +109,14 @@ final class DisjunctionMaxScorer extends DisjunctionScorer {
   }
 
   @Override
-  public void setMinCompetitiveScore(float minScore) {
+  public void setMinCompetitiveScore(float minScore) throws IOException {
     getBlockMaxApprox().setMinCompetitiveScore(minScore);
+    disjunctionBlockPropagator.setMinCompetitiveScore(minScore);
+    if (tieBreakerMultiplier == 0) {
+      // TODO: we could even remove some scorers from the priority queue?
+      for (Scorer scorer : subScorers) {
+        scorer.setMinCompetitiveScore(minScore);
+      }
+    }
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java
new file mode 100644
index 00000000000..a8fd206bb31
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionScoreBlockBoundaryPropagator.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+
+/**
+ * A helper to propagate block boundaries for disjunctions.
+ * Because a disjunction matches if any of its sub clauses matches, it is
+ * tempting to return the minimum block boundary across all clauses. The problem
+ * is that it might then make the query slow when the minimum competitive score
+ * is high and low-scoring clauses don't drive iteration anymore. So this class
+ * computes block boundaries only across clauses whose maximum score is greater
+ * than or equal to the minimum competitive score, or the maximum scoring clause
+ * if there is no such clause.
+ */
+final class DisjunctionScoreBlockBoundaryPropagator {
+
+  private static final Comparator<Scorer> MAX_SCORE_COMPARATOR = Comparator.comparing((Scorer s) -> {
+    try {
+      return s.getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }).thenComparing(Comparator.comparing(s -> s.iterator().cost()));
+
+  private final Scorer[] scorers;
+  private final float[] maxScores;
+  private int leadIndex = 0;
+
+  DisjunctionScoreBlockBoundaryPropagator(Collection<Scorer> scorers) throws IOException {
+    this.scorers = scorers.toArray(Scorer[]::new);
+    for (Scorer scorer : this.scorers) {
+      scorer.advanceShallow(0);
+    }
+    Arrays.sort(this.scorers, MAX_SCORE_COMPARATOR);
+
+    maxScores = new float[this.scorers.length];
+    for (int i = 0; i < this.scorers.length; ++i) {
+      maxScores[i] = this.scorers[i].getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
+    }
+  }
+
+  /**
+   * See {@link Scorer#advanceShallow(int)}.
+   */
+  int advanceShallow(int target) throws IOException {
+    // For scorers that are below the lead index, just propagate.
+    for (int i = 0; i < leadIndex; ++i) {
+      Scorer s = scorers[i];
+      if (s.docID() < target) {
+        s.advanceShallow(target);
+      }
+    }
+
+    // For scorers above the lead index, we take the minimum
+    // boundary.
+    Scorer leadScorer = scorers[leadIndex];
+    int upTo = leadScorer.advanceShallow(Math.max(leadScorer.docID(), target));
+
+    for (int i = leadIndex + 1; i < scorers.length; ++i) {
+      Scorer scorer = scorers[i];
+      if (scorer.docID() <= target) {
+        upTo = Math.min(scorer.advanceShallow(target), upTo);
+      }
+    }
+
+    // If the maximum scoring clauses are beyond `target`, then we use their
+    // docID as a boundary. It helps not consider them when computing the
+    // maximum score and get a lower score upper bound.
+    for (int i = scorers.length - 1; i > leadIndex; --i) {
+      Scorer scorer = scorers[i];
+      if (scorer.docID() > target) {
+        upTo = Math.min(upTo, scorer.docID() - 1);
+      } else {
+        break;
+      }
+    }
+
+    return upTo;
+  }
+
+  /**
+   * Set the minimum competitive score to filter out clauses that score less
+   * than this threshold.
+   * @see Scorer#setMinCompetitiveScore
+   */
+  void setMinCompetitiveScore(float minScore) throws IOException {
+    // Update the lead index if necessary
+    while (leadIndex < maxScores.length - 1 && minScore > maxScores[leadIndex]) {
+      leadIndex++;
+    }
+  }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionScoreBlockBoundaryPropagator.java b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionScoreBlockBoundaryPropagator.java
new file mode 100644
index 00000000000..423e002eaef
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionScoreBlockBoundaryPropagator.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDisjunctionScoreBlockBoundaryPropagator extends LuceneTestCase {
+
+  private static class FakeWeight extends Weight {
+
+    FakeWeight() {
+      super(new MatchNoDocsQuery());
+    }
+
+    @Override
+    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+      return null;
+    }
+
+    @Override
+    public Scorer scorer(LeafReaderContext context) throws IOException {
+      return null;
+    }
+
+    @Override
+    public boolean isCacheable(LeafReaderContext ctx) {
+      return false;
+    }
+  }
+
+  private static class FakeScorer extends Scorer {
+
+    final int boundary;
+    final float maxScore;
+
+    FakeScorer(int boundary, float maxScore) throws IOException {
+      super(new FakeWeight());
+      this.boundary = boundary;
+      this.maxScore = maxScore;
+    }
+
+    @Override
+    public int docID() {
+      return 0;
+    }
+
+    @Override
+    public float score() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public DocIdSetIterator iterator() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void setMinCompetitiveScore(float minCompetitiveScore) {}
+
+    @Override
+    public float getMaxScore(int upTo) throws IOException {
+      return maxScore;
+    }
+
+    @Override
+    public int advanceShallow(int target) {
+      assert target <= boundary;
+      return boundary;
+    }
+  }
+
+  public void testBasics() throws IOException {
+    Scorer scorer1 = new FakeScorer(20, 0.5f);
+    Scorer scorer2 = new FakeScorer(50, 1.5f);
+    Scorer scorer3 = new FakeScorer(30, 2f);
+    Scorer scorer4 = new FakeScorer(80, 3f);
+    List<Scorer> scorers = Arrays.asList(scorer1, scorer2, scorer3, scorer4);
+    Collections.shuffle(scorers, random());
+    DisjunctionScoreBlockBoundaryPropagator propagator = new DisjunctionScoreBlockBoundaryPropagator(scorers);
+    assertEquals(20, propagator.advanceShallow(0));
+
+    propagator.setMinCompetitiveScore(0.2f);
+    assertEquals(20, propagator.advanceShallow(0));
+
+    propagator.setMinCompetitiveScore(0.7f);
+    assertEquals(30, propagator.advanceShallow(0));
+
+    propagator.setMinCompetitiveScore(1.2f);
+    assertEquals(30, propagator.advanceShallow(0));
+
+    propagator.setMinCompetitiveScore(1.7f);
+    assertEquals(30, propagator.advanceShallow(0));
+
+    propagator.setMinCompetitiveScore(2.2f);
+    assertEquals(80, propagator.advanceShallow(0));
+
+    propagator.setMinCompetitiveScore(5f);
+    assertEquals(80, propagator.advanceShallow(0));
+  }
+
+}