DrillSideways optimizations (#11803)

DrillSidewaysScorer now breaks up first- and second-phase matching and makes use of advance when possible over nextDoc.
2022-09-29 05:22:30 -07:00 · 2022-09-29 05:22:30 -07:00 · d02ba3134f
parent 6f25c79db3
commit d02ba3134f
3 changed files with 109 additions and 32 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -129,6 +129,9 @@ Optimizations
 * GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
  (Luke Kot-Zaniewski)
 * GITHUB#11797: DrillSidewaysScorer has improved to leverage "advance" instead of "next" where
  possible, and splits out first and second phase checks to delay match confirmation. (Greg Miller)
 Other
 ---------------------
 * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
--- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
@ -17,8 +17,12 @@
 package org.apache.lucene.facet;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.search.BulkScorer;
@ -30,10 +34,22 @@ import org.apache.lucene.search.ScoreCachingWrappingScorer;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.CollectionUtil;
 import org.apache.lucene.util.FixedBitSet;
 class DrillSidewaysScorer extends BulkScorer {
  private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
      Comparator.comparingLong(e -> e.approximation.cost());
  private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
      new Comparator<DocsAndCost>() {
        @Override
        public int compare(DocsAndCost o1, DocsAndCost o2) {
          return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
        }
      };
  // private static boolean DEBUG = false;
  private final Collector drillDownCollector;
@ -44,6 +60,8 @@ class DrillSidewaysScorer extends BulkScorer {
  // DrillDown DocsEnums:
  private final Scorer baseScorer;
  private final DocIdSetIterator baseIterator;
  private final DocIdSetIterator baseApproximation;
  private final TwoPhaseIterator baseTwoPhase;
  private final LeafReaderContext context;
@ -65,6 +83,12 @@ class DrillSidewaysScorer extends BulkScorer {
    this.context = context;
    this.baseScorer = baseScorer;
    this.baseIterator = baseScorer.iterator();
    this.baseTwoPhase = baseScorer.twoPhaseIterator();
    if (baseTwoPhase != null) {
      this.baseApproximation = baseTwoPhase.approximation();
    } else {
      this.baseApproximation = baseIterator;
    }
    this.drillDownCollector = drillDownCollector;
    this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
  }
@ -149,60 +173,98 @@ class DrillSidewaysScorer extends BulkScorer {
   */
  private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
      throws IOException {
    // if (DEBUG) {
    //  System.out.println("  doQueryFirstScoring");
    // }
    setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
-    int docID = baseScorer.docID();
+    List<DocsAndCost> allDims = Arrays.asList(dims);
    CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
    List<DocsAndCost> twoPhaseDims = null;
    for (DocsAndCost dim : dims) {
      if (dim.twoPhase != null) {
        if (twoPhaseDims == null) {
          twoPhaseDims = new ArrayList<>(dims.length);
        }
        twoPhaseDims.add(dim);
      }
    }
    if (twoPhaseDims != null) {
      CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
    }
    int docID = baseApproximation.docID();
    nextDoc:
    while (docID != PostingsEnum.NO_MORE_DOCS) {
      assert docID == baseApproximation.docID();
      if (acceptDocs != null && acceptDocs.get(docID) == false) {
-        docID = baseIterator.nextDoc();
+        docID = baseApproximation.nextDoc();
        continue;
      }
-      LeafCollector failedCollector = null;
+
-      for (DocsAndCost dim : dims) {
+      DocsAndCost failedDim = null;
-        // TODO: should we sort this 2nd dimension of
+      for (DocsAndCost dim : allDims) {
-        // docsEnums from most frequent to least?
+        final int dimDocID;
        if (dim.approximation.docID() < docID) {
-          dim.approximation.advance(docID);
+          dimDocID = dim.approximation.advance(docID);
        } else {
          dimDocID = dim.approximation.docID();
        }
-
+        if (dimDocID != docID) {
-        boolean matches = false;
+          if (failedDim != null) {
-        if (dim.approximation.docID() == docID) {
+            int next = Math.min(dimDocID, failedDim.approximation.docID());
-          if (dim.twoPhase == null) {
+            docID = baseApproximation.advance(next);
            matches = true;
          } else {
            matches = dim.twoPhase.matches();
          }
        }
        if (matches == false) {
          if (failedCollector != null) {
            // More than one dim fails on this document, so
            // it's neither a hit nor a near-miss; move to
            // next doc:
            docID = baseIterator.nextDoc();
            continue nextDoc;
          } else {
-            failedCollector = dim.sidewaysLeafCollector;
+            failedDim = dim;
          }
        }
      }
      if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
        docID = baseApproximation.nextDoc();
        continue;
      }
      if (twoPhaseDims != null) {
        if (failedDim == null) {
          for (DocsAndCost dim : twoPhaseDims) {
            assert dim.approximation.docID() == baseApproximation.docID();
            if (dim.twoPhase.matches() == false) {
              if (failedDim != null) {
                int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
                docID = baseApproximation.advance(next);
                continue nextDoc;
              } else {
                failedDim = dim;
              }
            }
          }
        } else {
          for (DocsAndCost dim : twoPhaseDims) {
            if (failedDim == dim) {
              continue;
            }
            assert dim.approximation.docID() == baseApproximation.docID();
            if (dim.twoPhase.matches() == false) {
              int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
              docID = baseApproximation.advance(next);
              continue nextDoc;
            }
          }
        }
      }
      collectDocID = docID;
-
+      if (failedDim == null) {
      if (failedCollector == null) {
        // Hit passed all filters, so it's "real":
        collectHit(collector, dims);
      } else {
        // Hit missed exactly one filter:
-        collectNearMiss(failedCollector);
+        collectNearMiss(failedDim.sidewaysLeafCollector);
      }
-      docID = baseIterator.nextDoc();
+      docID = baseApproximation.nextDoc();
    }
  }
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
@ -33,6 +33,7 @@ import java.util.stream.Collectors;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
 import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
@ -1051,6 +1052,7 @@ public class TestDrillSideways extends FacetTestCase {
            doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
          }
          doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
          doc.add(new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue])));
          if (VERBOSE) {
            System.out.println("    dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
          }
@ -1063,6 +1065,8 @@ public class TestDrillSideways extends FacetTestCase {
            doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
          }
          doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
          doc.add(
              new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue2])));
          if (VERBOSE) {
            System.out.println("      dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
          }
@ -1188,7 +1192,15 @@ public class TestDrillSideways extends FacetTestCase {
      for (int dim = 0; dim < numDims; dim++) {
        if (drillDowns[dim] != null) {
          for (String value : drillDowns[dim]) {
-            ddq.add("dim" + dim, value);
+            // Sometimes use a "traditional" term query and sometimes use a two-phase approach to
            // ensure code coverage:
            if (random().nextBoolean()) {
              ddq.add("dim" + dim, value);
            } else {
              ddq.add(
                  "dim" + dim,
                  SortedSetDocValuesField.newSlowExactQuery("dim" + dim, new BytesRef(value)));
            }
          }
        }
      }