From d02ba3134ff58f578e31fad739095929db82db8a Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Thu, 29 Sep 2022 05:22:30 -0700 Subject: [PATCH] DrillSideways optimizations (#11803) DrillSidewaysScorer now breaks up first- and second-phase matching and makes use of advance when possible over nextDoc. --- lucene/CHANGES.txt | 3 + .../lucene/facet/DrillSidewaysScorer.java | 124 +++++++++++++----- .../lucene/facet/TestDrillSideways.java | 14 +- 3 files changed, 109 insertions(+), 32 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8abaecf3d92..80fb2047f55 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -129,6 +129,9 @@ Optimizations * GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream. (Luke Kot-Zaniewski) +* GITHUB#11797: DrillSidewaysScorer has improved to leverage "advance" instead of "next" where + possible, and splits out first and second phase checks to delay match confirmation. (Greg Miller) + Other --------------------- * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas) diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java index bf006f36a4a..360cb0bf74a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java @@ -17,8 +17,12 @@ package org.apache.lucene.facet; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; +import java.util.List; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.search.BulkScorer; @@ -30,10 +34,22 @@ import org.apache.lucene.search.ScoreCachingWrappingScorer; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.FixedBitSet; class DrillSidewaysScorer extends BulkScorer { + private static final Comparator APPROXIMATION_COMPARATOR = + Comparator.comparingLong(e -> e.approximation.cost()); + + private static final Comparator TWO_PHASE_COMPARATOR = + new Comparator() { + @Override + public int compare(DocsAndCost o1, DocsAndCost o2) { + return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost()); + } + }; + // private static boolean DEBUG = false; private final Collector drillDownCollector; @@ -44,6 +60,8 @@ class DrillSidewaysScorer extends BulkScorer { // DrillDown DocsEnums: private final Scorer baseScorer; private final DocIdSetIterator baseIterator; + private final DocIdSetIterator baseApproximation; + private final TwoPhaseIterator baseTwoPhase; private final LeafReaderContext context; @@ -65,6 +83,12 @@ class DrillSidewaysScorer extends BulkScorer { this.context = context; this.baseScorer = baseScorer; this.baseIterator = baseScorer.iterator(); + this.baseTwoPhase = baseScorer.twoPhaseIterator(); + if (baseTwoPhase != null) { + this.baseApproximation = baseTwoPhase.approximation(); + } else { + this.baseApproximation = baseIterator; + } this.drillDownCollector = drillDownCollector; this.scoreSubDocsAtOnce = scoreSubDocsAtOnce; } @@ -149,60 +173,98 @@ class DrillSidewaysScorer extends BulkScorer { */ private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException { - // if (DEBUG) { - // System.out.println(" doQueryFirstScoring"); - // } setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer)); - int docID = baseScorer.docID(); + List allDims = Arrays.asList(dims); + CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR); + + List twoPhaseDims = null; + for (DocsAndCost dim : dims) { + if (dim.twoPhase != null) { + if (twoPhaseDims == null) { + twoPhaseDims = new ArrayList<>(dims.length); + } + twoPhaseDims.add(dim); + } + } + if (twoPhaseDims != null) { + CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR); + } + + int docID = baseApproximation.docID(); nextDoc: while (docID != PostingsEnum.NO_MORE_DOCS) { + assert docID == baseApproximation.docID(); + if (acceptDocs != null && acceptDocs.get(docID) == false) { - docID = baseIterator.nextDoc(); + docID = baseApproximation.nextDoc(); continue; } - LeafCollector failedCollector = null; - for (DocsAndCost dim : dims) { - // TODO: should we sort this 2nd dimension of - // docsEnums from most frequent to least? + + DocsAndCost failedDim = null; + for (DocsAndCost dim : allDims) { + final int dimDocID; if (dim.approximation.docID() < docID) { - dim.approximation.advance(docID); + dimDocID = dim.approximation.advance(docID); + } else { + dimDocID = dim.approximation.docID(); } - - boolean matches = false; - if (dim.approximation.docID() == docID) { - if (dim.twoPhase == null) { - matches = true; - } else { - matches = dim.twoPhase.matches(); - } - } - - if (matches == false) { - if (failedCollector != null) { - // More than one dim fails on this document, so - // it's neither a hit nor a near-miss; move to - // next doc: - docID = baseIterator.nextDoc(); + if (dimDocID != docID) { + if (failedDim != null) { + int next = Math.min(dimDocID, failedDim.approximation.docID()); + docID = baseApproximation.advance(next); continue nextDoc; } else { - failedCollector = dim.sidewaysLeafCollector; + failedDim = dim; + } + } + } + + if (baseTwoPhase != null && baseTwoPhase.matches() == false) { + docID = baseApproximation.nextDoc(); + continue; + } + + if (twoPhaseDims != null) { + if (failedDim == null) { + for (DocsAndCost dim : twoPhaseDims) { + assert dim.approximation.docID() == baseApproximation.docID(); + if (dim.twoPhase.matches() == false) { + if (failedDim != null) { + int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc()); + docID = baseApproximation.advance(next); + continue nextDoc; + } else { + failedDim = dim; + } + } + } + } else { + for (DocsAndCost dim : twoPhaseDims) { + if (failedDim == dim) { + continue; + } + assert dim.approximation.docID() == baseApproximation.docID(); + if (dim.twoPhase.matches() == false) { + int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc()); + docID = baseApproximation.advance(next); + continue nextDoc; + } } } } collectDocID = docID; - - if (failedCollector == null) { + if (failedDim == null) { // Hit passed all filters, so it's "real": collectHit(collector, dims); } else { // Hit missed exactly one filter: - collectNearMiss(failedCollector); + collectNearMiss(failedDim.sidewaysLeafCollector); } - docID = baseIterator.nextDoc(); + docID = baseApproximation.nextDoc(); } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java index 7274bfe7a45..36382041a5c 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java @@ -33,6 +33,7 @@ import java.util.stream.Collectors; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult; import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; @@ -1051,6 +1052,7 @@ public class TestDrillSideways extends FacetTestCase { doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue])); } doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES)); + doc.add(new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue]))); if (VERBOSE) { System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue])); } @@ -1063,6 +1065,8 @@ public class TestDrillSideways extends FacetTestCase { doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2])); } doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES)); + doc.add( + new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue2]))); if (VERBOSE) { System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2])); } @@ -1188,7 +1192,15 @@ public class TestDrillSideways extends FacetTestCase { for (int dim = 0; dim < numDims; dim++) { if (drillDowns[dim] != null) { for (String value : drillDowns[dim]) { - ddq.add("dim" + dim, value); + // Sometimes use a "traditional" term query and sometimes use a two-phase approach to + // ensure code coverage: + if (random().nextBoolean()) { + ddq.add("dim" + dim, value); + } else { + ddq.add( + "dim" + dim, + SortedSetDocValuesField.newSlowExactQuery("dim" + dim, new BytesRef(value))); + } } } }