DrillSideways optimizations (#11803)

DrillSidewaysScorer now breaks up first- and second-phase matching and uses advance() instead of nextDoc() where possible.
Greg Miller 2022-09-29 05:22:30 -07:00 committed by GitHub
parent 6f25c79db3
commit d02ba3134f
3 changed files with 109 additions and 32 deletions
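
Editorial note, not part of the commit: the heart of the change is preferring DocIdSetIterator.advance(target) over repeated nextDoc() calls, so that a dimension's iterator can jump straight to the base query's current document. A minimal sketch of that pattern (hypothetical iterators named base and dim):

    import java.io.IOException;
    import org.apache.lucene.search.DocIdSetIterator;

    class LeapfrogSketch {
      // Walk two iterators in lockstep; a single advance(target) call can skip many
      // documents that nextDoc() would otherwise have to visit one at a time.
      static void leapfrog(DocIdSetIterator base, DocIdSetIterator dim) throws IOException {
        int docID = base.nextDoc();
        while (docID != DocIdSetIterator.NO_MORE_DOCS) {
          int dimDoc = dim.docID();
          if (dimDoc < docID) {
            dimDoc = dim.advance(docID); // jump to (or past) the base doc in one call
          }
          if (dimDoc == docID) {
            // both iterators are positioned on the same document: a candidate match
          }
          docID = base.nextDoc();
        }
      }
    }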


@@ -129,6 +129,9 @@ Optimizations
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
(Luke Kot-Zaniewski)
* GITHUB#11797: DrillSidewaysScorer has been improved to leverage "advance" instead of "next" where
  possible, and now splits out first- and second-phase checks to delay match confirmation. (Greg Miller)
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
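
As background for the "first- and second-phase" wording above (an editorial sketch, not code from this commit): a TwoPhaseIterator pairs a cheap approximation() with a potentially expensive matches() confirmation, which is what allows the scorer to delay confirmation until all of the cheap checks have already lined up on the same document:

    import java.io.IOException;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.Scorer;
    import org.apache.lucene.search.TwoPhaseIterator;

    class TwoPhaseSketch {
      // Drive iteration with the cheap approximation and call matches() last.
      static void collectConfirmed(Scorer scorer) throws IOException {
        TwoPhaseIterator twoPhase = scorer.twoPhaseIterator(); // null if there is no second phase
        DocIdSetIterator approx =
            twoPhase != null ? twoPhase.approximation() : scorer.iterator();
        for (int doc = approx.nextDoc();
            doc != DocIdSetIterator.NO_MORE_DOCS;
            doc = approx.nextDoc()) {
          // ... first phase: reconcile other iterators with `doc`, bail out early on a mismatch ...
          if (twoPhase == null || twoPhase.matches()) {
            // second phase passed: `doc` is a confirmed hit
          }
        }
      }
    }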


@@ -17,8 +17,12 @@
package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.BulkScorer;
@@ -30,10 +34,22 @@ import org.apache.lucene.search.ScoreCachingWrappingScorer;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.FixedBitSet;
class DrillSidewaysScorer extends BulkScorer {
private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
Comparator.comparingLong(e -> e.approximation.cost());
private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
new Comparator<DocsAndCost>() {
@Override
public int compare(DocsAndCost o1, DocsAndCost o2) {
return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
}
};
// private static boolean DEBUG = false;
private final Collector drillDownCollector;
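
For context on the two comparators above (editorial note, not part of the diff): sorting by DocIdSetIterator.cost() puts the sparsest approximations first, and sorting by TwoPhaseIterator.matchCost() runs the cheapest confirmations first, so in both cases a mismatch is discovered as cheaply as possible. A self-contained sketch of that ordering:

    import java.util.Comparator;
    import java.util.List;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.TwoPhaseIterator;
    import org.apache.lucene.util.CollectionUtil;

    class CostOrderSketch {
      // cost() estimates how many documents an iterator may match; matchCost()
      // estimates the per-document price of confirming a match.
      static void sortCheapestFirst(
          List<DocIdSetIterator> approximations, List<TwoPhaseIterator> confirmations) {
        CollectionUtil.timSort(approximations, Comparator.comparingLong(DocIdSetIterator::cost));
        CollectionUtil.timSort(confirmations, Comparator.comparingDouble(TwoPhaseIterator::matchCost));
      }
    }
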
@@ -44,6 +60,8 @@ class DrillSidewaysScorer extends BulkScorer {
// DrillDown DocsEnums:
private final Scorer baseScorer;
private final DocIdSetIterator baseIterator;
private final DocIdSetIterator baseApproximation;
private final TwoPhaseIterator baseTwoPhase;
private final LeafReaderContext context;
@@ -65,6 +83,12 @@ class DrillSidewaysScorer extends BulkScorer {
this.context = context;
this.baseScorer = baseScorer;
this.baseIterator = baseScorer.iterator();
this.baseTwoPhase = baseScorer.twoPhaseIterator();
if (baseTwoPhase != null) {
this.baseApproximation = baseTwoPhase.approximation();
} else {
this.baseApproximation = baseIterator;
}
this.drillDownCollector = drillDownCollector;
this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
}
@@ -149,60 +173,98 @@
*/
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
throws IOException {
// if (DEBUG) {
// System.out.println(" doQueryFirstScoring");
// }
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
int docID = baseScorer.docID();
List<DocsAndCost> allDims = Arrays.asList(dims);
CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
List<DocsAndCost> twoPhaseDims = null;
for (DocsAndCost dim : dims) {
if (dim.twoPhase != null) {
if (twoPhaseDims == null) {
twoPhaseDims = new ArrayList<>(dims.length);
}
twoPhaseDims.add(dim);
}
}
if (twoPhaseDims != null) {
CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
}
int docID = baseApproximation.docID();
nextDoc:
while (docID != PostingsEnum.NO_MORE_DOCS) {
assert docID == baseApproximation.docID();
if (acceptDocs != null && acceptDocs.get(docID) == false) {
docID = baseIterator.nextDoc();
docID = baseApproximation.nextDoc();
continue;
}
LeafCollector failedCollector = null;
for (DocsAndCost dim : dims) {
// TODO: should we sort this 2nd dimension of
// docsEnums from most frequent to least?
DocsAndCost failedDim = null;
for (DocsAndCost dim : allDims) {
final int dimDocID;
if (dim.approximation.docID() < docID) {
dim.approximation.advance(docID);
dimDocID = dim.approximation.advance(docID);
} else {
dimDocID = dim.approximation.docID();
}
boolean matches = false;
if (dim.approximation.docID() == docID) {
if (dim.twoPhase == null) {
matches = true;
} else {
matches = dim.twoPhase.matches();
}
}
if (matches == false) {
if (failedCollector != null) {
// More than one dim fails on this document, so
// it's neither a hit nor a near-miss; move to
// next doc:
docID = baseIterator.nextDoc();
if (dimDocID != docID) {
if (failedDim != null) {
int next = Math.min(dimDocID, failedDim.approximation.docID());
docID = baseApproximation.advance(next);
continue nextDoc;
} else {
failedCollector = dim.sidewaysLeafCollector;
failedDim = dim;
}
}
}
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
docID = baseApproximation.nextDoc();
continue;
}
if (twoPhaseDims != null) {
if (failedDim == null) {
for (DocsAndCost dim : twoPhaseDims) {
assert dim.approximation.docID() == baseApproximation.docID();
if (dim.twoPhase.matches() == false) {
if (failedDim != null) {
int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
docID = baseApproximation.advance(next);
continue nextDoc;
} else {
failedDim = dim;
}
}
}
} else {
for (DocsAndCost dim : twoPhaseDims) {
if (failedDim == dim) {
continue;
}
assert dim.approximation.docID() == baseApproximation.docID();
if (dim.twoPhase.matches() == false) {
int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
docID = baseApproximation.advance(next);
continue nextDoc;
}
}
}
}
collectDocID = docID;
if (failedCollector == null) {
if (failedDim == null) {
// Hit passed all filters, so it's "real":
collectHit(collector, dims);
} else {
// Hit missed exactly one filter:
collectNearMiss(failedCollector);
collectNearMiss(failedDim.sidewaysLeafCollector);
}
docID = baseIterator.nextDoc();
docID = baseApproximation.nextDoc();
}
}
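
A condensed, editorial sketch of the shape of the new query-first loop above (simplified and self-contained; the real method also tracks DocsAndCost holders, the base scorer's own two-phase check, and sideways collection): every cheap approximation is reconciled with the base doc first, at most one failing dim is tolerated as a "near miss", a second failure triggers an advance() past both misses, and only then would the expensive two-phase confirmations run.

    import java.io.IOException;
    import org.apache.lucene.search.DocIdSetIterator;

    class QueryFirstLoopSketch {
      static void firstPhase(DocIdSetIterator baseApproximation, DocIdSetIterator[] dims)
          throws IOException {
        int docID = baseApproximation.nextDoc();
        nextDoc:
        while (docID != DocIdSetIterator.NO_MORE_DOCS) {
          DocIdSetIterator failedDim = null; // the single dim allowed to miss this doc
          for (DocIdSetIterator dim : dims) {
            int dimDocID = dim.docID();
            if (dimDocID < docID) {
              dimDocID = dim.advance(docID); // jump, don't step one doc at a time
            }
            if (dimDocID != docID) {
              if (failedDim != null) {
                // Two dims miss: neither a hit nor a near miss, so jump the base
                // iterator to the earliest doc where things could line up again.
                docID = baseApproximation.advance(Math.min(dimDocID, failedDim.docID()));
                continue nextDoc;
              }
              failedDim = dim;
            }
          }
          // ... second phase (two-phase confirmations, cheapest first) and
          // hit vs. near-miss collection would happen here ...
          docID = baseApproximation.nextDoc();
        }
      }
    }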


@@ -33,6 +33,7 @@ import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
@@ -1051,6 +1052,7 @@ public class TestDrillSideways extends FacetTestCase {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
doc.add(new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue])));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
}
@@ -1063,6 +1065,8 @@
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
doc.add(
new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue2])));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
}
@@ -1188,7 +1192,15 @@
for (int dim = 0; dim < numDims; dim++) {
if (drillDowns[dim] != null) {
for (String value : drillDowns[dim]) {
ddq.add("dim" + dim, value);
// Sometimes use a "traditional" term query and sometimes use a two-phase approach to
// ensure code coverage:
if (random().nextBoolean()) {
ddq.add("dim" + dim, value);
} else {
ddq.add(
"dim" + dim,
SortedSetDocValuesField.newSlowExactQuery("dim" + dim, new BytesRef(value)));
}
}
}
}
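
Editorial note on the randomized query choice above (a sketch, not code from the commit; field and value names are placeholders): SortedSetDocValuesField.newSlowExactQuery builds a doc-values-backed query whose scorer typically exposes a TwoPhaseIterator, so mixing it into the DrillDownQuery exercises the deferred second-phase path in DrillSidewaysScorer, assuming the matching SortedSetDocValuesField was indexed as shown earlier in the test:

    import org.apache.lucene.document.SortedSetDocValuesField;
    import org.apache.lucene.facet.DrillDownQuery;
    import org.apache.lucene.facet.FacetsConfig;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.BytesRef;

    class TwoPhaseDrillDownSketch {
      static DrillDownQuery build(FacetsConfig config) {
        DrillDownQuery ddq = new DrillDownQuery(config);
        // A postings-based drill-down resolves in the first phase alone ...
        ddq.add("dim0", "someValue");
        // ... while a doc-values constraint is confirmed per document in the second phase.
        Query dvConstraint =
            SortedSetDocValuesField.newSlowExactQuery("dim1", new BytesRef("otherValue"));
        ddq.add("dim1", dvConstraint);
        return ddq;
      }
    }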