Further optimize DrillSideways scoring (#11881)

This commit is contained in:
Greg Miller 2022-11-08 10:08:12 -08:00 committed by GitHub
parent f9c26ed501
commit c66a559050
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 178 additions and 90 deletions

View File

@ -160,6 +160,9 @@ Optimizations
* GITHUB#11876: Use ByteArrayComparator to speed up PointInSetQuery in single dimension case.
(Guo Feng)
* GITHUB#11881: Further optimize drill-sideways scoring by specializing the single dimension case
and borrowing some concepts from "min should match" scoring. (Greg Miller)
* GITHUB#11884: Simplify the logic of matchAll() in IndexSortSortedNumericDocValuesRangeQuery. (Lu Xugang)
Other

View File

@ -18,13 +18,11 @@ package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
@ -36,19 +34,15 @@ import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.PriorityQueue;
class DrillSidewaysScorer extends BulkScorer {
private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
Comparator.comparingLong(e -> e.approximation.cost());
Comparator.comparingLong(e -> e.cost);
private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
new Comparator<DocsAndCost>() {
@Override
public int compare(DocsAndCost o1, DocsAndCost o2) {
return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
}
};
(a, b) -> Float.compare(a.matchCost, b.matchCost);
// private static boolean DEBUG = false;
@ -167,34 +161,13 @@ class DrillSidewaysScorer extends BulkScorer {
}
/**
* Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
* at once (i.e., like BooleanScorer2, not BooleanScorer). In this case we just .next() on base
* and .advance() on the dim filters.
* Query-first scoring specialization when there is only one drill-sideways dimension, which is
* likely a common scenario.
*/
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
throws IOException {
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
List<DocsAndCost> allDims = Arrays.asList(dims);
CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
List<DocsAndCost> twoPhaseDims = null;
for (DocsAndCost dim : dims) {
if (dim.twoPhase != null) {
if (twoPhaseDims == null) {
twoPhaseDims = new ArrayList<>(dims.length);
}
twoPhaseDims.add(dim);
}
}
if (twoPhaseDims != null) {
CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
}
private void doQueryFirstScoringSingleDim(
Bits acceptDocs, LeafCollector collector, DocsAndCost dim) throws IOException {
int docID = baseApproximation.docID();
nextDoc:
while (docID != PostingsEnum.NO_MORE_DOCS) {
while (docID != DocIdSetIterator.NO_MORE_DOCS) {
assert docID == baseApproximation.docID();
if (acceptDocs != null && acceptDocs.get(docID) == false) {
@ -202,38 +175,128 @@ class DrillSidewaysScorer extends BulkScorer {
continue;
}
DocsAndCost failedDim = null;
for (DocsAndCost dim : allDims) {
final int dimDocID;
if (dim.approximation.docID() < docID) {
dimDocID = dim.approximation.advance(docID);
} else {
dimDocID = dim.approximation.docID();
}
if (dimDocID != docID) {
if (failedDim != null) {
int next = Math.min(dimDocID, failedDim.approximation.docID());
docID = baseApproximation.advance(next);
continue nextDoc;
} else {
failedDim = dim;
}
}
}
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
docID = baseApproximation.nextDoc();
continue;
}
if (twoPhaseDims != null) {
// We have either a near-miss or full match. Check the sideways dim to see which it is:
collectDocID = docID;
if (advanceIfBehind(docID, dim.approximation) != docID
|| (dim.twoPhase != null && dim.twoPhase.matches() == false)) {
// The sideways dim missed, so we have a "near miss":
collectNearMiss(dim.sidewaysLeafCollector);
} else {
// Hit passed all filters, so it's "real":
collectHit(collector, dim);
}
docID = baseApproximation.nextDoc();
}
}
/**
* Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
* at once (i.e., like BooleanScorer2, not BooleanScorer).
*/
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
throws IOException {
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
// Specialize the single-dim use-case as we have a more efficient implementation for that:
if (dims.length == 1) {
doQueryFirstScoringSingleDim(acceptDocs, collector, dims[0]);
return;
}
// Sort our sideways dims by approximation cost so we can advance the lower cost ones first:
List<DocsAndCost> sidewaysDims = new ArrayList<>(dims.length);
sidewaysDims.addAll(List.of(dims));
CollectionUtil.timSort(sidewaysDims, APPROXIMATION_COMPARATOR);
// Maintain (optional) subset of sideways dims that support two-phase iteration, sorted by
// matchCost:
List<DocsAndCost> sidewaysTwoPhaseDims = null;
for (DocsAndCost dim : dims) {
if (dim.twoPhase != null) {
if (sidewaysTwoPhaseDims == null) {
sidewaysTwoPhaseDims = new ArrayList<>();
}
sidewaysTwoPhaseDims.add(dim);
}
}
if (sidewaysTwoPhaseDims != null) {
CollectionUtil.timSort(sidewaysTwoPhaseDims, TWO_PHASE_COMPARATOR);
}
// We keep track of a "runaway" dimension, which is a previously "near missed" dimension that
// has advanced beyond the docID the rest of the dimensions are positioned on. This functions
// a bit like the "head" queue in WANDScorer's "min should match" implementation. We use a
// single-valued PQ ordered by docID to easily determine the "closest" runaway dim we'll use
// for advancing in the case that multiple dim approximations miss.
PriorityQueue<DocsAndCost> runawayDim =
new PriorityQueue<>(1) {
@Override
protected boolean lessThan(DocsAndCost a, DocsAndCost b) {
return a.approximation.docID() < b.approximation.docID();
}
};
int docID = baseApproximation.docID();
nextDoc:
while (docID != DocIdSetIterator.NO_MORE_DOCS) {
assert docID == baseApproximation.docID();
if (acceptDocs != null && acceptDocs.get(docID) == false) {
docID = baseApproximation.nextDoc();
continue;
}
// If we carried a "runaway" over from the last iteration, see if we've "caught up" yet:
DocsAndCost runaway = runawayDim.top();
if (runaway != null && runaway.approximation.docID() <= docID) {
runawayDim.clear();
runaway = null;
}
// Check the sideways dim approximations. At most, one dim is allowed to miss for the doc
// to be a near-miss or full match. If multiple sideways dims miss, we move on:
for (DocsAndCost dim : sidewaysDims) {
int dimDocID = advanceIfBehind(docID, dim.approximation);
if (dimDocID != docID && dim != runaway) {
DocsAndCost evicted = runawayDim.insertWithOverflow(dim);
if (evicted != null) {
// More than one dim has advanced beyond docID, so we jump ahead to the "closer" of
// the two:
int next = evicted.approximation.docID();
docID = baseApproximation.advance(next);
continue nextDoc;
}
}
}
// At this point, we have an "approximate" near-miss or full match, but we still need
// to confirm two-phase iterators. First, check the base two-phase (it's always required):
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
docID = baseApproximation.nextDoc();
continue;
}
// If we have two-phase iterators for our sideways dims, check them now. At most, one
// sideways dim can miss for the doc to be a near-miss or full match. If more than one misses,
// we move on:
DocsAndCost failedDim = runawayDim.top();
if (sidewaysTwoPhaseDims != null) {
if (failedDim == null) {
for (DocsAndCost dim : twoPhaseDims) {
assert dim.approximation.docID() == baseApproximation.docID();
// If all sideways dims matched in their approximation phase, then we can allow one
// second-phase check to fail:
for (DocsAndCost dim : sidewaysTwoPhaseDims) {
assert dim.approximation.docID() == docID;
if (dim.twoPhase.matches() == false) {
if (failedDim != null) {
int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
docID = baseApproximation.advance(next);
// Two second-phase checks have failed, so we move on:
docID = baseApproximation.nextDoc();
continue nextDoc;
} else {
failedDim = dim;
@ -241,14 +304,14 @@ class DrillSidewaysScorer extends BulkScorer {
}
}
} else {
for (DocsAndCost dim : twoPhaseDims) {
// If a sideways dim failed the approximate check, then no second-phase checks can fail:
for (DocsAndCost dim : sidewaysTwoPhaseDims) {
if (failedDim == dim) {
continue;
}
assert dim.approximation.docID() == baseApproximation.docID();
assert dim.approximation.docID() == docID;
if (dim.twoPhase.matches() == false) {
int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
docID = baseApproximation.advance(next);
docID = baseApproximation.nextDoc();
continue nextDoc;
}
}
@ -258,9 +321,9 @@ class DrillSidewaysScorer extends BulkScorer {
collectDocID = docID;
if (failedDim == null) {
// Hit passed all filters, so it's "real":
collectHit(collector, dims);
collectHit(collector, sidewaysDims);
} else {
// Hit missed exactly one filter:
// Hit missed exactly one dim:
collectNearMiss(failedDim.sidewaysLeafCollector);
}
@ -268,6 +331,14 @@ class DrillSidewaysScorer extends BulkScorer {
}
}
/**
 * Ensures {@code iterator} is positioned on or after {@code docID}.
 *
 * <p>If the iterator is already at or beyond {@code docID} it is left untouched and its current
 * position is returned. Otherwise it is advanced to {@code docID} and the value returned by
 * {@link DocIdSetIterator#advance(int)} is passed back to the caller.
 *
 * @param docID the target doc ID
 * @param iterator the iterator to position
 * @return the doc ID the iterator is positioned on after the call
 * @throws IOException if advancing the iterator fails
 */
private static int advanceIfBehind(int docID, DocIdSetIterator iterator) throws IOException {
  // Already on or past the target: nothing to advance.
  if (iterator.docID() >= docID) {
    return iterator.docID();
  }
  return iterator.advance(docID);
}
/** Used when drill downs are highly constraining vs baseQuery. */
private void doDrillDownAdvanceScoring(
Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
@ -651,6 +722,28 @@ class DrillSidewaysScorer extends BulkScorer {
}
}
/**
 * Collects the current doc ({@code collectDocID}) as a full hit for the single-dimension case.
 *
 * <p>The doc is passed to the main {@code collector}, to the drill-down leaf collector when a
 * drill-down collector is present, and to the sideways leaf collector of {@code dim}.
 *
 * @param collector the main leaf collector receiving the hit
 * @param dim the only drill-sideways dimension, whose sideways collector is also tallied
 * @throws IOException if any collector fails
 */
private void collectHit(LeafCollector collector, DocsAndCost dim) throws IOException {
  final int doc = collectDocID;

  collector.collect(doc);

  if (drillDownCollector != null) {
    drillDownLeafCollector.collect(doc);
  }

  // A full hit also counts towards the sideways dimension:
  dim.sidewaysLeafCollector.collect(doc);
}
/**
 * Collects the current doc ({@code collectDocID}) as a full hit for the multi-dimension case.
 *
 * <p>The doc is passed to the main {@code collector}, to the drill-down leaf collector when a
 * drill-down collector is present, and then to the sideways leaf collector of every dimension in
 * {@code dims}, in list order.
 *
 * @param collector the main leaf collector receiving the hit
 * @param dims the drill-sideways dimensions whose sideways collectors are also tallied
 * @throws IOException if any collector fails
 */
private void collectHit(LeafCollector collector, List<DocsAndCost> dims) throws IOException {
  final int doc = collectDocID;

  collector.collect(doc);

  if (drillDownCollector != null) {
    drillDownLeafCollector.collect(doc);
  }

  // A full hit counts towards every sideways dimension:
  for (DocsAndCost sidewaysDim : dims) {
    sidewaysDim.sidewaysLeafCollector.collect(doc);
  }
}
private void collectNearMiss(LeafCollector sidewaysCollector) throws IOException {
// if (DEBUG) {
// System.out.println(" missingDim=" + dim);
@ -689,8 +782,10 @@ class DrillSidewaysScorer extends BulkScorer {
static class DocsAndCost {
// approximation of matching docs, or the scorer itself
final DocIdSetIterator approximation;
final long cost;
// two-phase confirmation, or null if the approximation is accurate
final TwoPhaseIterator twoPhase;
final float matchCost;
final Collector sidewaysCollector;
LeafCollector sidewaysLeafCollector;
@ -699,10 +794,13 @@ class DrillSidewaysScorer extends BulkScorer {
if (twoPhase == null) {
this.approximation = scorer.iterator();
this.twoPhase = null;
this.matchCost = 0f;
} else {
this.approximation = twoPhase.approximation();
this.twoPhase = twoPhase;
this.matchCost = twoPhase.matchCost();
}
this.cost = approximation.cost();
this.sidewaysCollector = sidewaysCollector;
}
}

View File

@ -297,36 +297,26 @@ public class TestDrillSideways extends FacetTestCase {
// NRT open
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
// Run all the basic test cases with a standard DrillSideways implementation:
DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
runDrillSidewaysTestCases(config, ds);
// Run all the basic test cases but make sure DS is set to score all sub-docs at once, so
// we exercise the doc-at-a-time scoring methodology:
ds = getNewDrillSidewaysScoreSubdocsAtOnce(searcher, config, taxoReader);
runDrillSidewaysTestCases(config, ds);
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) throws Exception {
// case: drill-down on a single field; in this
// case the drill-sideways + drill-down counts ==
// drill-down of just the query:
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
DrillSidewaysResult r = ds.search(null, ddq, 10);
assertEquals(2, r.hits.totalHits.value);
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
assertEquals(
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
r.facets.getTopChildren(10, "Publish Date").toString());
// Author is drill-sideways + drill-down: Lisa
// (drill-down) published twice, and Frank/Susan/Bob
// published once:
assertEquals(
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
r.facets.getTopChildren(10, "Author").toString());
// Same simple case, but no baseQuery (pure browse):
// drill-down on a single field; in this case the
// drill-sideways + drill-down counts == drill-down of
// just the query:
ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
r = ds.search(null, ddq, 10);
assertEquals(2, r.hits.totalHits.value);
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
@ -484,9 +474,6 @@ public class TestDrillSideways extends FacetTestCase {
() -> {
finalR.facets.getTopChildren(0, "Author");
});
writer.close();
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
public void testBasicWithCollectorManager() throws Exception {