Further optimize DrillSideways scoring (#11881)

2022-11-08 10:08:12 -08:00 · 2022-11-08 10:08:12 -08:00 · c66a559050
parent f9c26ed501
commit c66a559050
3 changed files with 178 additions and 90 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -160,6 +160,9 @@ Optimizations
 * GITHUB#11876: Use ByteArrayComparator to speed up PointInSetQuery in single dimension case.
  (Guo Feng)

+* GITHUB#11881: Further optimize drill-sideways scoring by specializing the single dimension case
+  and borrowing some concepts from "min should match" scoring. (Greg Miller)
+
 * GITHUB#11884: Simplify the logic of matchAll() in IndexSortSortedNumericDocValuesRangeQuery. (Lu Xugang)

 Other
--- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
@ -18,13 +18,11 @@ package org.apache.lucene.facet;

 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.search.BulkScorer;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.DocIdSetIterator;
@ -36,19 +34,15 @@ import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.CollectionUtil;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.PriorityQueue;

 class DrillSidewaysScorer extends BulkScorer {

  private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
-      Comparator.comparingLong(e -> e.approximation.cost());
+      Comparator.comparingLong(e -> e.cost);

  private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
-      new Comparator<DocsAndCost>() {
-        @Override
-        public int compare(DocsAndCost o1, DocsAndCost o2) {
-          return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
-        }
-      };
+      (a, b) -> Float.compare(a.matchCost, b.matchCost);

  // private static boolean DEBUG = false;

@ -167,34 +161,13 @@ class DrillSidewaysScorer extends BulkScorer {
  }

  /**
-   * Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
-   * at once (i.e., like BooleanScorer2, not BooleanScorer). In this case we just .next() on base
-   * and .advance() on the dim filters.
+   * Query-first scoring specialization when there is only one drill-sideways dimension, which is
+   * likely a common scenario.
   */
-  private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
-      throws IOException {
-    setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
-
-    List<DocsAndCost> allDims = Arrays.asList(dims);
-    CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
-
-    List<DocsAndCost> twoPhaseDims = null;
-    for (DocsAndCost dim : dims) {
-      if (dim.twoPhase != null) {
-        if (twoPhaseDims == null) {
-          twoPhaseDims = new ArrayList<>(dims.length);
-        }
-        twoPhaseDims.add(dim);
-      }
-    }
-    if (twoPhaseDims != null) {
-      CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
-    }
-
+  private void doQueryFirstScoringSingleDim(
+      Bits acceptDocs, LeafCollector collector, DocsAndCost dim) throws IOException {
    int docID = baseApproximation.docID();
-
-    nextDoc:
-    while (docID != PostingsEnum.NO_MORE_DOCS) {
+    while (docID != DocIdSetIterator.NO_MORE_DOCS) {
      assert docID == baseApproximation.docID();

      if (acceptDocs != null && acceptDocs.get(docID) == false) {
@ -202,38 +175,128 @@ class DrillSidewaysScorer extends BulkScorer {
        continue;
      }

-      DocsAndCost failedDim = null;
-      for (DocsAndCost dim : allDims) {
-        final int dimDocID;
-        if (dim.approximation.docID() < docID) {
-          dimDocID = dim.approximation.advance(docID);
-        } else {
-          dimDocID = dim.approximation.docID();
-        }
-        if (dimDocID != docID) {
-          if (failedDim != null) {
-            int next = Math.min(dimDocID, failedDim.approximation.docID());
-            docID = baseApproximation.advance(next);
-            continue nextDoc;
-          } else {
-            failedDim = dim;
-          }
-        }
-      }
-
      if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
        docID = baseApproximation.nextDoc();
        continue;
      }

-      if (twoPhaseDims != null) {
+      // We have either a near-miss or full match. Check the sideways dim to see which it is:
+      collectDocID = docID;
+      if (advanceIfBehind(docID, dim.approximation) != docID
+          || (dim.twoPhase != null && dim.twoPhase.matches() == false)) {
+        // The sideways dim missed, so we have a "near miss":
+        collectNearMiss(dim.sidewaysLeafCollector);
+      } else {
+        // Hit passed all filters, so it's "real":
+        collectHit(collector, dim);
+      }
+
+      docID = baseApproximation.nextDoc();
+    }
+  }
+
+  /**
+   * Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
+   * at once (i.e., like BooleanScorer2, not BooleanScorer).
+   */
+  private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
+      throws IOException {
+    setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
+
+    // Specialize the single-dim use-case as we have a more efficient implementation for that:
+    if (dims.length == 1) {
+      doQueryFirstScoringSingleDim(acceptDocs, collector, dims[0]);
+      return;
+    }
+
+    // Sort our sideways dims by approximation cost so we can advance the lower cost ones first:
+    List<DocsAndCost> sidewaysDims = new ArrayList<>(dims.length);
+    sidewaysDims.addAll(List.of(dims));
+    CollectionUtil.timSort(sidewaysDims, APPROXIMATION_COMPARATOR);
+
+    // Maintain (optional) subset of sideways dims that support two-phase iteration, sorted by
+    // matchCost:
+    List<DocsAndCost> sidewaysTwoPhaseDims = null;
+    for (DocsAndCost dim : dims) {
+      if (dim.twoPhase != null) {
+        if (sidewaysTwoPhaseDims == null) {
+          sidewaysTwoPhaseDims = new ArrayList<>();
+        }
+        sidewaysTwoPhaseDims.add(dim);
+      }
+    }
+    if (sidewaysTwoPhaseDims != null) {
+      CollectionUtil.timSort(sidewaysTwoPhaseDims, TWO_PHASE_COMPARATOR);
+    }
+
+    // We keep track of a "runaway" dimension, which is a previously "near missed" dimension that
+    // has advanced beyond the docID the rest of the dimensions are positioned on. This functions
+    // a bit like the "head" queue in WANDScorer's "min should match" implementation. We use a
+    // single-valued PQ ordered by docID to easily determine the "closest" runaway dim we'll use
+    // for advancing in the case that multiple dim approximations miss.
+    PriorityQueue<DocsAndCost> runawayDim =
+        new PriorityQueue<>(1) {
+          @Override
+          protected boolean lessThan(DocsAndCost a, DocsAndCost b) {
+            return a.approximation.docID() < b.approximation.docID();
+          }
+        };
+
+    int docID = baseApproximation.docID();
+
+    nextDoc:
+    while (docID != DocIdSetIterator.NO_MORE_DOCS) {
+      assert docID == baseApproximation.docID();
+
+      if (acceptDocs != null && acceptDocs.get(docID) == false) {
+        docID = baseApproximation.nextDoc();
+        continue;
+      }
+
+      // If we carried a "runaway" over from the last iteration, see if we've "caught up" yet:
+      DocsAndCost runaway = runawayDim.top();
+      if (runaway != null && runaway.approximation.docID() <= docID) {
+        runawayDim.clear();
+        runaway = null;
+      }
+
+      // Check the sideways dim approximations. At most, one dim is allowed to miss for the doc
+      // to be a near-miss or full match. If multiple sideways dims miss, we move on:
+      for (DocsAndCost dim : sidewaysDims) {
+        int dimDocID = advanceIfBehind(docID, dim.approximation);
+        if (dimDocID != docID && dim != runaway) {
+          DocsAndCost evicted = runawayDim.insertWithOverflow(dim);
+          if (evicted != null) {
+            // More than one dim has advanced beyond docID, so we jump ahead to the "closer" of
+            // the two:
+            int next = evicted.approximation.docID();
+            docID = baseApproximation.advance(next);
+            continue nextDoc;
+          }
+        }
+      }
+
+      // At this point, we have an "approximate" near-miss or full match, but we still need
+      // to confirm two-phase iterators. First, check the base two-phase (it's always required):
+      if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
+        docID = baseApproximation.nextDoc();
+        continue;
+      }
+
+      // If we have two-phase iterators for our sideways dims, check them now. At most, one
+      // sideways dim can miss for the doc to be a near-miss or full match. If more than one misses
+      // we move on:
+      DocsAndCost failedDim = runawayDim.top();
+      if (sidewaysTwoPhaseDims != null) {
        if (failedDim == null) {
-          for (DocsAndCost dim : twoPhaseDims) {
-            assert dim.approximation.docID() == baseApproximation.docID();
+          // If all sideways dims matched in their approximation phase, then we can allow one
+          // second-phase check to fail:
+          for (DocsAndCost dim : sidewaysTwoPhaseDims) {
+            assert dim.approximation.docID() == docID;
            if (dim.twoPhase.matches() == false) {
              if (failedDim != null) {
-                int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
-                docID = baseApproximation.advance(next);
+                // Two second-phase checks have failed, so we move on:
+                docID = baseApproximation.nextDoc();
                continue nextDoc;
              } else {
                failedDim = dim;
@ -241,14 +304,14 @@ class DrillSidewaysScorer extends BulkScorer {
            }
          }
        } else {
-          for (DocsAndCost dim : twoPhaseDims) {
+          // If a sideways dim failed the approximate check, then no second-phase checks can fail:
+          for (DocsAndCost dim : sidewaysTwoPhaseDims) {
            if (failedDim == dim) {
              continue;
            }
-            assert dim.approximation.docID() == baseApproximation.docID();
+            assert dim.approximation.docID() == docID;
            if (dim.twoPhase.matches() == false) {
-              int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
-              docID = baseApproximation.advance(next);
+              docID = baseApproximation.nextDoc();
              continue nextDoc;
            }
          }
@ -258,9 +321,9 @@ class DrillSidewaysScorer extends BulkScorer {
      collectDocID = docID;
      if (failedDim == null) {
        // Hit passed all filters, so it's "real":
-        collectHit(collector, dims);
+        collectHit(collector, sidewaysDims);
      } else {
-        // Hit missed exactly one filter:
+        // Hit missed exactly one dim:
        collectNearMiss(failedDim.sidewaysLeafCollector);
      }

@ -268,6 +331,14 @@ class DrillSidewaysScorer extends BulkScorer {
    }
  }

+  private static int advanceIfBehind(int docID, DocIdSetIterator iterator) throws IOException {
+    if (iterator.docID() < docID) {
+      return iterator.advance(docID);
+    } else {
+      return iterator.docID();
+    }
+  }
+
  /** Used when drill downs are highly constraining vs baseQuery. */
  private void doDrillDownAdvanceScoring(
      Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
@ -651,6 +722,28 @@ class DrillSidewaysScorer extends BulkScorer {
    }
  }

+  private void collectHit(LeafCollector collector, DocsAndCost dim) throws IOException {
+    collector.collect(collectDocID);
+    if (drillDownCollector != null) {
+      drillDownLeafCollector.collect(collectDocID);
+    }
+
+    // Tally sideways count:
+    dim.sidewaysLeafCollector.collect(collectDocID);
+  }
+
+  private void collectHit(LeafCollector collector, List<DocsAndCost> dims) throws IOException {
+    collector.collect(collectDocID);
+    if (drillDownCollector != null) {
+      drillDownLeafCollector.collect(collectDocID);
+    }
+
+    // Tally sideways counts:
+    for (DocsAndCost dim : dims) {
+      dim.sidewaysLeafCollector.collect(collectDocID);
+    }
+  }
+
  private void collectNearMiss(LeafCollector sidewaysCollector) throws IOException {
    // if (DEBUG) {
    //  System.out.println("      missingDim=" + dim);
@ -689,8 +782,10 @@ class DrillSidewaysScorer extends BulkScorer {
  static class DocsAndCost {
    // approximation of matching docs, or the scorer itself
    final DocIdSetIterator approximation;
+    final long cost;
    // two-phase confirmation, or null if the approximation is accurate
    final TwoPhaseIterator twoPhase;
+    final float matchCost;
    final Collector sidewaysCollector;
    LeafCollector sidewaysLeafCollector;

@ -699,10 +794,13 @@ class DrillSidewaysScorer extends BulkScorer {
      if (twoPhase == null) {
        this.approximation = scorer.iterator();
        this.twoPhase = null;
+        this.matchCost = 0f;
      } else {
        this.approximation = twoPhase.approximation();
        this.twoPhase = twoPhase;
+        this.matchCost = twoPhase.matchCost();
      }
+      this.cost = approximation.cost();
      this.sidewaysCollector = sidewaysCollector;
    }
  }
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
@ -297,36 +297,26 @@ public class TestDrillSideways extends FacetTestCase {
    // NRT open
    TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);

+    // Run all the basic test cases with a standard DrillSideways implementation:
    DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
+    runDrillSidewaysTestCases(config, ds);

+    // Run all the basic test cases but make sure DS is set to score all sub-docs at once, so
+    // we exercise the doc-at-a-time scoring methodology:
+    ds = getNewDrillSidewaysScoreSubdocsAtOnce(searcher, config, taxoReader);
+    runDrillSidewaysTestCases(config, ds);
+
+    writer.close();
+    IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
+  }
+
+  private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) throws Exception {
    //  case: drill-down on a single field; in this
    // case the drill-sideways + drill-down counts ==
    // drill-down of just the query:
    DrillDownQuery ddq = new DrillDownQuery(config);
    ddq.add("Author", "Lisa");
    DrillSidewaysResult r = ds.search(null, ddq, 10);
-    assertEquals(2, r.hits.totalHits.value);
-    // Publish Date is only drill-down, and Lisa published
-    // one in 2012 and one in 2010:
-    assertEquals(
-        "dim=Publish Date path=[] value=2 childCount=2\n  2010 (1)\n  2012 (1)\n",
-        r.facets.getTopChildren(10, "Publish Date").toString());
-
-    // Author is drill-sideways + drill-down: Lisa
-    // (drill-down) published twice, and Frank/Susan/Bob
-    // published once:
-    assertEquals(
-        "dim=Author path=[] value=5 childCount=4\n  Lisa (2)\n  Bob (1)\n  Susan (1)\n  Frank (1)\n",
-        r.facets.getTopChildren(10, "Author").toString());
-
-    // Same simple case, but no baseQuery (pure browse):
-    // drill-down on a single field; in this case the
-    // drill-sideways + drill-down counts == drill-down of
-    // just the query:
-    ddq = new DrillDownQuery(config);
-    ddq.add("Author", "Lisa");
-    r = ds.search(null, ddq, 10);
-
    assertEquals(2, r.hits.totalHits.value);
    // Publish Date is only drill-down, and Lisa published
    // one in 2012 and one in 2010:
@ -484,9 +474,6 @@ public class TestDrillSideways extends FacetTestCase {
        () -> {
          finalR.facets.getTopChildren(0, "Author");
        });
-
-    writer.close();
-    IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
  }

  public void testBasicWithCollectorManager() throws Exception {