DrillSideways optimizations (#11803)

DrillSidewaysScorer now separates first-phase (approximation) and second-phase (match confirmation) checks, and uses advance instead of nextDoc wherever possible.
This commit is contained in:
Greg Miller 2022-09-29 05:22:30 -07:00 committed by GitHub
parent 6f25c79db3
commit d02ba3134f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 109 additions and 32 deletions

View File

@ -129,6 +129,9 @@ Optimizations
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream. * GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
(Luke Kot-Zaniewski) (Luke Kot-Zaniewski)
* GITHUB#11797: DrillSidewaysScorer has been improved to leverage "advance" instead of "next" where
possible, and now splits out first- and second-phase checks to delay match confirmation. (Greg Miller)
Other Other
--------------------- ---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas) * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)

View File

@ -17,8 +17,12 @@
package org.apache.lucene.facet; package org.apache.lucene.facet;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.BulkScorer;
@ -30,10 +34,22 @@ import org.apache.lucene.search.ScoreCachingWrappingScorer;
import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.FixedBitSet;
class DrillSidewaysScorer extends BulkScorer { class DrillSidewaysScorer extends BulkScorer {
private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
Comparator.comparingLong(e -> e.approximation.cost());
private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
new Comparator<DocsAndCost>() {
@Override
public int compare(DocsAndCost o1, DocsAndCost o2) {
return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
}
};
// private static boolean DEBUG = false; // private static boolean DEBUG = false;
private final Collector drillDownCollector; private final Collector drillDownCollector;
@ -44,6 +60,8 @@ class DrillSidewaysScorer extends BulkScorer {
// DrillDown DocsEnums: // DrillDown DocsEnums:
private final Scorer baseScorer; private final Scorer baseScorer;
private final DocIdSetIterator baseIterator; private final DocIdSetIterator baseIterator;
private final DocIdSetIterator baseApproximation;
private final TwoPhaseIterator baseTwoPhase;
private final LeafReaderContext context; private final LeafReaderContext context;
@ -65,6 +83,12 @@ class DrillSidewaysScorer extends BulkScorer {
this.context = context; this.context = context;
this.baseScorer = baseScorer; this.baseScorer = baseScorer;
this.baseIterator = baseScorer.iterator(); this.baseIterator = baseScorer.iterator();
this.baseTwoPhase = baseScorer.twoPhaseIterator();
if (baseTwoPhase != null) {
this.baseApproximation = baseTwoPhase.approximation();
} else {
this.baseApproximation = baseIterator;
}
this.drillDownCollector = drillDownCollector; this.drillDownCollector = drillDownCollector;
this.scoreSubDocsAtOnce = scoreSubDocsAtOnce; this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
} }
@ -149,60 +173,98 @@ class DrillSidewaysScorer extends BulkScorer {
*/ */
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
throws IOException { throws IOException {
// if (DEBUG) {
// System.out.println(" doQueryFirstScoring");
// }
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer)); setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
int docID = baseScorer.docID(); List<DocsAndCost> allDims = Arrays.asList(dims);
CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
List<DocsAndCost> twoPhaseDims = null;
for (DocsAndCost dim : dims) {
if (dim.twoPhase != null) {
if (twoPhaseDims == null) {
twoPhaseDims = new ArrayList<>(dims.length);
}
twoPhaseDims.add(dim);
}
}
if (twoPhaseDims != null) {
CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
}
int docID = baseApproximation.docID();
nextDoc: nextDoc:
while (docID != PostingsEnum.NO_MORE_DOCS) { while (docID != PostingsEnum.NO_MORE_DOCS) {
assert docID == baseApproximation.docID();
if (acceptDocs != null && acceptDocs.get(docID) == false) { if (acceptDocs != null && acceptDocs.get(docID) == false) {
docID = baseIterator.nextDoc(); docID = baseApproximation.nextDoc();
continue; continue;
} }
LeafCollector failedCollector = null;
for (DocsAndCost dim : dims) { DocsAndCost failedDim = null;
// TODO: should we sort this 2nd dimension of for (DocsAndCost dim : allDims) {
// docsEnums from most frequent to least? final int dimDocID;
if (dim.approximation.docID() < docID) { if (dim.approximation.docID() < docID) {
dim.approximation.advance(docID); dimDocID = dim.approximation.advance(docID);
} else {
dimDocID = dim.approximation.docID();
} }
if (dimDocID != docID) {
boolean matches = false; if (failedDim != null) {
if (dim.approximation.docID() == docID) { int next = Math.min(dimDocID, failedDim.approximation.docID());
if (dim.twoPhase == null) { docID = baseApproximation.advance(next);
matches = true;
} else {
matches = dim.twoPhase.matches();
}
}
if (matches == false) {
if (failedCollector != null) {
// More than one dim fails on this document, so
// it's neither a hit nor a near-miss; move to
// next doc:
docID = baseIterator.nextDoc();
continue nextDoc; continue nextDoc;
} else { } else {
failedCollector = dim.sidewaysLeafCollector; failedDim = dim;
}
}
}
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
docID = baseApproximation.nextDoc();
continue;
}
if (twoPhaseDims != null) {
if (failedDim == null) {
for (DocsAndCost dim : twoPhaseDims) {
assert dim.approximation.docID() == baseApproximation.docID();
if (dim.twoPhase.matches() == false) {
if (failedDim != null) {
int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
docID = baseApproximation.advance(next);
continue nextDoc;
} else {
failedDim = dim;
}
}
}
} else {
for (DocsAndCost dim : twoPhaseDims) {
if (failedDim == dim) {
continue;
}
assert dim.approximation.docID() == baseApproximation.docID();
if (dim.twoPhase.matches() == false) {
int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
docID = baseApproximation.advance(next);
continue nextDoc;
}
} }
} }
} }
collectDocID = docID; collectDocID = docID;
if (failedDim == null) {
if (failedCollector == null) {
// Hit passed all filters, so it's "real": // Hit passed all filters, so it's "real":
collectHit(collector, dims); collectHit(collector, dims);
} else { } else {
// Hit missed exactly one filter: // Hit missed exactly one filter:
collectNearMiss(failedCollector); collectNearMiss(failedDim.sidewaysLeafCollector);
} }
docID = baseIterator.nextDoc(); docID = baseApproximation.nextDoc();
} }
} }

View File

@ -33,6 +33,7 @@ import java.util.stream.Collectors;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField; import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult; import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
@ -1051,6 +1052,7 @@ public class TestDrillSideways extends FacetTestCase {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue])); doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
} }
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES)); doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
doc.add(new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue])));
if (VERBOSE) { if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue])); System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
} }
@ -1063,6 +1065,8 @@ public class TestDrillSideways extends FacetTestCase {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2])); doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
} }
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES)); doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
doc.add(
new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue2])));
if (VERBOSE) { if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2])); System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
} }
@ -1188,7 +1192,15 @@ public class TestDrillSideways extends FacetTestCase {
for (int dim = 0; dim < numDims; dim++) { for (int dim = 0; dim < numDims; dim++) {
if (drillDowns[dim] != null) { if (drillDowns[dim] != null) {
for (String value : drillDowns[dim]) { for (String value : drillDowns[dim]) {
ddq.add("dim" + dim, value); // Sometimes use a "traditional" term query and sometimes use a two-phase approach to
// ensure code coverage:
if (random().nextBoolean()) {
ddq.add("dim" + dim, value);
} else {
ddq.add(
"dim" + dim,
SortedSetDocValuesField.newSlowExactQuery("dim" + dim, new BytesRef(value)));
}
} }
} }
} }