DrillSideways optimizations (#11803)

DrillSidewaysScorer now breaks up first- and second-phase matching and uses advance() instead of nextDoc() where possible.
Greg Miller 2022-09-29 05:22:30 -07:00 committed by GitHub
parent 6f25c79db3
commit d02ba3134f
3 changed files with 109 additions and 32 deletions
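
Editorial note, not part of the commit: the heart of the change is preferring DocIdSetIterator.advance(target) over repeated nextDoc() calls, so that a dimension's iterator can jump straight to the base query's current document. A minimal sketch of that pattern (hypothetical iterators named base and dim):

    import java.io.IOException;
    import org.apache.lucene.search.DocIdSetIterator;

    class LeapfrogSketch {
      // Walk two iterators in lockstep; a single advance(target) call can skip many
      // documents that nextDoc() would otherwise have to visit one at a time.
      static void leapfrog(DocIdSetIterator base, DocIdSetIterator dim) throws IOException {
        int docID = base.nextDoc();
        while (docID != DocIdSetIterator.NO_MORE_DOCS) {
          int dimDoc = dim.docID();
          if (dimDoc < docID) {
            dimDoc = dim.advance(docID); // jump to (or past) the base doc in one call
          }
          if (dimDoc == docID) {
            // both iterators are positioned on the same document: a candidate match
          }
          docID = base.nextDoc();
        }
      }
    }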


@@ -129,6 +129,9 @@ Optimizations
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
(Luke Kot-Zaniewski)
* GITHUB#11797: DrillSidewaysScorer has been improved to leverage "advance" instead of "next" where
  possible, and now splits out first- and second-phase checks to delay match confirmation. (Greg Miller)
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
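
As background for the "first- and second-phase" wording above (an editorial sketch, not code from this commit): a TwoPhaseIterator pairs a cheap approximation() with a potentially expensive matches() confirmation, which is what allows the scorer to delay confirmation until all of the cheap checks have already lined up on the same document:

    import java.io.IOException;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.Scorer;
    import org.apache.lucene.search.TwoPhaseIterator;

    class TwoPhaseSketch {
      // Drive iteration with the cheap approximation and call matches() last.
      static void collectConfirmed(Scorer scorer) throws IOException {
        TwoPhaseIterator twoPhase = scorer.twoPhaseIterator(); // null if there is no second phase
        DocIdSetIterator approx =
            twoPhase != null ? twoPhase.approximation() : scorer.iterator();
        for (int doc = approx.nextDoc();
            doc != DocIdSetIterator.NO_MORE_DOCS;
            doc = approx.nextDoc()) {
          // ... first phase: reconcile other iterators with `doc`, bail out early on a mismatch ...
          if (twoPhase == null || twoPhase.matches()) {
            // second phase passed: `doc` is a confirmed hit
          }
        }
      }
    }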


@@ -17,8 +17,12 @@
package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.BulkScorer;
@@ -30,10 +34,22 @@ import org.apache.lucene.search.ScoreCachingWrappingScorer;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.FixedBitSet;
class DrillSidewaysScorer extends BulkScorer {
private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
Comparator.comparingLong(e -> e.approximation.cost());
private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
new Comparator<DocsAndCost>() {
@Override
public int compare(DocsAndCost o1, DocsAndCost o2) {
return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
}
};
// private static boolean DEBUG = false;
private final Collector drillDownCollector;
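
For context on the two comparators above (editorial note, not part of the diff): sorting by DocIdSetIterator.cost() puts the sparsest approximations first, and sorting by TwoPhaseIterator.matchCost() runs the cheapest confirmations first, so in both cases a mismatch is discovered as cheaply as possible. A self-contained sketch of that ordering:

    import java.util.Comparator;
    import java.util.List;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.TwoPhaseIterator;
    import org.apache.lucene.util.CollectionUtil;

    class CostOrderSketch {
      // cost() estimates how many documents an iterator may match; matchCost()
      // estimates the per-document price of confirming a match.
      static void sortCheapestFirst(
          List<DocIdSetIterator> approximations, List<TwoPhaseIterator> confirmations) {
        CollectionUtil.timSort(approximations, Comparator.comparingLong(DocIdSetIterator::cost));
        CollectionUtil.timSort(confirmations, Comparator.comparingDouble(TwoPhaseIterator::matchCost));
      }
    }
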
@@ -44,6 +60,8 @@ class DrillSidewaysScorer extends BulkScorer {
// DrillDown DocsEnums:
private final Scorer baseScorer;
private final DocIdSetIterator baseIterator;
private final DocIdSetIterator baseApproximation;
private final TwoPhaseIterator baseTwoPhase;
private final LeafReaderContext context;
@@ -65,6 +83,12 @@ class DrillSidewaysScorer extends BulkScorer {
this.context = context;
this.baseScorer = baseScorer;
this.baseIterator = baseScorer.iterator();
this.baseTwoPhase = baseScorer.twoPhaseIterator();
if (baseTwoPhase != null) {
this.baseApproximation = baseTwoPhase.approximation();
} else {
this.baseApproximation = baseIterator;
}
this.drillDownCollector = drillDownCollector;
this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
}
@@ -149,60 +173,98 @@
*/
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
throws IOException {
// if (DEBUG) {
// System.out.println(" doQueryFirstScoring");
// }
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
int docID = baseScorer.docID();
List<DocsAndCost> allDims = Arrays.asList(dims);
CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
List<DocsAndCost> twoPhaseDims = null;
for (DocsAndCost dim : dims) {
if (dim.twoPhase != null) {
if (twoPhaseDims == null) {
twoPhaseDims = new ArrayList<>(dims.length);
}
twoPhaseDims.add(dim);
}
}
if (twoPhaseDims != null) {
CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
}
int docID = baseApproximation.docID();
nextDoc:
while (docID != PostingsEnum.NO_MORE_DOCS) {
assert docID == baseApproximation.docID();
if (acceptDocs != null && acceptDocs.get(docID) == false) {
docID = baseIterator.nextDoc();
docID = baseApproximation.nextDoc();
continue;
}
LeafCollector failedCollector = null;
for (DocsAndCost dim : dims) {
// TODO: should we sort this 2nd dimension of
// docsEnums from most frequent to least?
DocsAndCost failedDim = null;
for (DocsAndCost dim : allDims) {
final int dimDocID;
if (dim.approximation.docID() < docID) {
dim.approximation.advance(docID);
dimDocID = dim.approximation.advance(docID);
} else {
dimDocID = dim.approximation.docID();
}
boolean matches = false;
if (dim.approximation.docID() == docID) {
if (dim.twoPhase == null) {
matches = true;
} else {
matches = dim.twoPhase.matches();
}
}
if (matches == false) {
if (failedCollector != null) {
// More than one dim fails on this document, so
// it's neither a hit nor a near-miss; move to
// next doc:
docID = baseIterator.nextDoc();
if (dimDocID != docID) {
if (failedDim != null) {
int next = Math.min(dimDocID, failedDim.approximation.docID());
docID = baseApproximation.advance(next);
continue nextDoc;
} else {
failedCollector = dim.sidewaysLeafCollector;
failedDim = dim;
}
}
}
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
docID = baseApproximation.nextDoc();
continue;
}
if (twoPhaseDims != null) {
if (failedDim == null) {
for (DocsAndCost dim : twoPhaseDims) {
assert dim.approximation.docID() == baseApproximation.docID();
if (dim.twoPhase.matches() == false) {
if (failedDim != null) {
int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
docID = baseApproximation.advance(next);
continue nextDoc;
} else {
failedDim = dim;
}
}
}
} else {
for (DocsAndCost dim : twoPhaseDims) {
if (failedDim == dim) {
continue;
}
assert dim.approximation.docID() == baseApproximation.docID();
if (dim.twoPhase.matches() == false) {
int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
docID = baseApproximation.advance(next);
continue nextDoc;
}
}
}
}
collectDocID = docID;
if (failedCollector == null) {
if (failedDim == null) {
// Hit passed all filters, so it's "real":
collectHit(collector, dims);
} else {
// Hit missed exactly one filter:
collectNearMiss(failedCollector);
collectNearMiss(failedDim.sidewaysLeafCollector);
}
docID = baseIterator.nextDoc();
docID = baseApproximation.nextDoc();
}
}
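
A condensed, editorial sketch of the shape of the new query-first loop above (simplified and self-contained; the real method also tracks DocsAndCost holders, the base scorer's own two-phase check, and sideways collection): every cheap approximation is reconciled with the base doc first, at most one failing dim is tolerated as a "near miss", a second failure triggers an advance() past both misses, and only then would the expensive two-phase confirmations run.

    import java.io.IOException;
    import org.apache.lucene.search.DocIdSetIterator;

    class QueryFirstLoopSketch {
      static void firstPhase(DocIdSetIterator baseApproximation, DocIdSetIterator[] dims)
          throws IOException {
        int docID = baseApproximation.nextDoc();
        nextDoc:
        while (docID != DocIdSetIterator.NO_MORE_DOCS) {
          DocIdSetIterator failedDim = null; // the single dim allowed to miss this doc
          for (DocIdSetIterator dim : dims) {
            int dimDocID = dim.docID();
            if (dimDocID < docID) {
              dimDocID = dim.advance(docID); // jump, don't step one doc at a time
            }
            if (dimDocID != docID) {
              if (failedDim != null) {
                // Two dims miss: neither a hit nor a near miss, so jump the base
                // iterator to the earliest doc where things could line up again.
                docID = baseApproximation.advance(Math.min(dimDocID, failedDim.docID()));
                continue nextDoc;
              }
              failedDim = dim;
            }
          }
          // ... second phase (two-phase confirmations, cheapest first) and
          // hit vs. near-miss collection would happen here ...
          docID = baseApproximation.nextDoc();
        }
      }
    }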


@@ -33,6 +33,7 @@ import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
@@ -1051,6 +1052,7 @@ public class TestDrillSideways extends FacetTestCase {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
doc.add(new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue])));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
}
@@ -1063,6 +1065,8 @@
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
doc.add(
new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue2])));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
}
@@ -1188,7 +1192,15 @@
for (int dim = 0; dim < numDims; dim++) {
if (drillDowns[dim] != null) {
for (String value : drillDowns[dim]) {
ddq.add("dim" + dim, value);
// Sometimes use a "traditional" term query and sometimes use a two-phase approach to
// ensure code coverage:
if (random().nextBoolean()) {
ddq.add("dim" + dim, value);
} else {
ddq.add(
"dim" + dim,
SortedSetDocValuesField.newSlowExactQuery("dim" + dim, new BytesRef(value)));
}
}
}
}
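
Editorial note on the randomized query choice above (a sketch, not code from the commit; field and value names are placeholders): SortedSetDocValuesField.newSlowExactQuery builds a doc-values-backed query whose scorer typically exposes a TwoPhaseIterator, so mixing it into the DrillDownQuery exercises the deferred second-phase path in DrillSidewaysScorer, assuming the matching SortedSetDocValuesField was indexed as shown earlier in the test:

    import org.apache.lucene.document.SortedSetDocValuesField;
    import org.apache.lucene.facet.DrillDownQuery;
    import org.apache.lucene.facet.FacetsConfig;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.BytesRef;

    class TwoPhaseDrillDownSketch {
      static DrillDownQuery build(FacetsConfig config) {
        DrillDownQuery ddq = new DrillDownQuery(config);
        // A postings-based drill-down resolves in the first phase alone ...
        ddq.add("dim0", "someValue");
        // ... while a doc-values constraint is confirmed per document in the second phase.
        Query dvConstraint =
            SortedSetDocValuesField.newSlowExactQuery("dim1", new BytesRef("otherValue"));
        ddq.add("dim1", dvConstraint);
        return ddq;
      }
    }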