mirror of https://github.com/apache/lucene.git
Further optimize DrillSideways scoring (#11881)
This commit is contained in:
parent
f9c26ed501
commit
c66a559050
|
@ -160,6 +160,9 @@ Optimizations
|
|||
* GITHUB#11876: Use ByteArrayComparator to speed up PointInSetQuery in single dimension case.
|
||||
(Guo Feng)
|
||||
|
||||
* GITHUB#11881: Further optimize drill-sideways scoring by specializing the single dimension case
|
||||
and borrowing some concepts from "min should match" scoring. (Greg Miller)
|
||||
|
||||
* GITHUB#11884: Simplify the logic of matchAll() in IndexSortSortedNumericDocValuesRangeQuery. (Lu Xugang)
|
||||
|
||||
Other
|
||||
|
|
|
@ -18,13 +18,11 @@ package org.apache.lucene.facet;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.search.BulkScorer;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
|
@ -36,19 +34,15 @@ import org.apache.lucene.search.TwoPhaseIterator;
|
|||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.CollectionUtil;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
class DrillSidewaysScorer extends BulkScorer {
|
||||
|
||||
private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
|
||||
Comparator.comparingLong(e -> e.approximation.cost());
|
||||
Comparator.comparingLong(e -> e.cost);
|
||||
|
||||
private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
|
||||
new Comparator<DocsAndCost>() {
|
||||
@Override
|
||||
public int compare(DocsAndCost o1, DocsAndCost o2) {
|
||||
return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
|
||||
}
|
||||
};
|
||||
(a, b) -> Float.compare(a.matchCost, b.matchCost);
|
||||
|
||||
// private static boolean DEBUG = false;
|
||||
|
||||
|
@ -167,34 +161,13 @@ class DrillSidewaysScorer extends BulkScorer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
|
||||
* at once (i.e., like BooleanScorer2, not BooleanScorer). In this case we just .next() on base
|
||||
* and .advance() on the dim filters.
|
||||
* Query-first scoring specialization when there is only one drill-sideways dimension, which is
|
||||
* likely a common scenario.
|
||||
*/
|
||||
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
|
||||
throws IOException {
|
||||
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
|
||||
|
||||
List<DocsAndCost> allDims = Arrays.asList(dims);
|
||||
CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
|
||||
|
||||
List<DocsAndCost> twoPhaseDims = null;
|
||||
for (DocsAndCost dim : dims) {
|
||||
if (dim.twoPhase != null) {
|
||||
if (twoPhaseDims == null) {
|
||||
twoPhaseDims = new ArrayList<>(dims.length);
|
||||
}
|
||||
twoPhaseDims.add(dim);
|
||||
}
|
||||
}
|
||||
if (twoPhaseDims != null) {
|
||||
CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
|
||||
}
|
||||
|
||||
private void doQueryFirstScoringSingleDim(
|
||||
Bits acceptDocs, LeafCollector collector, DocsAndCost dim) throws IOException {
|
||||
int docID = baseApproximation.docID();
|
||||
|
||||
nextDoc:
|
||||
while (docID != PostingsEnum.NO_MORE_DOCS) {
|
||||
while (docID != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
assert docID == baseApproximation.docID();
|
||||
|
||||
if (acceptDocs != null && acceptDocs.get(docID) == false) {
|
||||
|
@ -202,38 +175,128 @@ class DrillSidewaysScorer extends BulkScorer {
|
|||
continue;
|
||||
}
|
||||
|
||||
DocsAndCost failedDim = null;
|
||||
for (DocsAndCost dim : allDims) {
|
||||
final int dimDocID;
|
||||
if (dim.approximation.docID() < docID) {
|
||||
dimDocID = dim.approximation.advance(docID);
|
||||
} else {
|
||||
dimDocID = dim.approximation.docID();
|
||||
}
|
||||
if (dimDocID != docID) {
|
||||
if (failedDim != null) {
|
||||
int next = Math.min(dimDocID, failedDim.approximation.docID());
|
||||
docID = baseApproximation.advance(next);
|
||||
continue nextDoc;
|
||||
} else {
|
||||
failedDim = dim;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
|
||||
docID = baseApproximation.nextDoc();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (twoPhaseDims != null) {
|
||||
// We have either a near-miss or full match. Check the sideways dim to see which it is:
|
||||
collectDocID = docID;
|
||||
if (advanceIfBehind(docID, dim.approximation) != docID
|
||||
|| (dim.twoPhase != null && dim.twoPhase.matches() == false)) {
|
||||
// The sideways dim missed, so we have a "near miss":
|
||||
collectNearMiss(dim.sidewaysLeafCollector);
|
||||
} else {
|
||||
// Hit passed all filters, so it's "real":
|
||||
collectHit(collector, dim);
|
||||
}
|
||||
|
||||
docID = baseApproximation.nextDoc();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
|
||||
* at once (i.e., like BooleanScorer2, not BooleanScorer).
|
||||
*/
|
||||
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
|
||||
throws IOException {
|
||||
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
|
||||
|
||||
// Specialize the single-dim use-case as we have a more efficient implementation for that:
|
||||
if (dims.length == 1) {
|
||||
doQueryFirstScoringSingleDim(acceptDocs, collector, dims[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
// Sort our sideways dims by approximation cost so we can advance the lower cost ones first:
|
||||
List<DocsAndCost> sidewaysDims = new ArrayList<>(dims.length);
|
||||
sidewaysDims.addAll(List.of(dims));
|
||||
CollectionUtil.timSort(sidewaysDims, APPROXIMATION_COMPARATOR);
|
||||
|
||||
// Maintain (optional) subset of sideways dims that support two-phase iteration, sorted by
|
||||
// matchCost:
|
||||
List<DocsAndCost> sidewaysTwoPhaseDims = null;
|
||||
for (DocsAndCost dim : dims) {
|
||||
if (dim.twoPhase != null) {
|
||||
if (sidewaysTwoPhaseDims == null) {
|
||||
sidewaysTwoPhaseDims = new ArrayList<>();
|
||||
}
|
||||
sidewaysTwoPhaseDims.add(dim);
|
||||
}
|
||||
}
|
||||
if (sidewaysTwoPhaseDims != null) {
|
||||
CollectionUtil.timSort(sidewaysTwoPhaseDims, TWO_PHASE_COMPARATOR);
|
||||
}
|
||||
|
||||
// We keep track of a "runaway" dimension, which is a previously "near missed" dimension that
|
||||
// has advanced beyond the docID the rest of the dimensions are positioned on. This functions
|
||||
// a bit like the "head" queue in WANDScorer's "min should match" implementation. We use a
|
||||
// single-valued PQ ordered by docID to easily determine the "closest" runaway dim we'll use
|
||||
// for advancing in the case that multiple dim approximations miss.
|
||||
PriorityQueue<DocsAndCost> runawayDim =
|
||||
new PriorityQueue<>(1) {
|
||||
@Override
|
||||
protected boolean lessThan(DocsAndCost a, DocsAndCost b) {
|
||||
return a.approximation.docID() < b.approximation.docID();
|
||||
}
|
||||
};
|
||||
|
||||
int docID = baseApproximation.docID();
|
||||
|
||||
nextDoc:
|
||||
while (docID != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
assert docID == baseApproximation.docID();
|
||||
|
||||
if (acceptDocs != null && acceptDocs.get(docID) == false) {
|
||||
docID = baseApproximation.nextDoc();
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we carried a "runaway" over from the last iteration, see if we've "caught up" yet:
|
||||
DocsAndCost runaway = runawayDim.top();
|
||||
if (runaway != null && runaway.approximation.docID() <= docID) {
|
||||
runawayDim.clear();
|
||||
runaway = null;
|
||||
}
|
||||
|
||||
// Check the sideways dim approximations. At most, one dim is allowed to miss for the doc
|
||||
// to be a near-miss or full match. If multiple sideways dims miss, we move on:
|
||||
for (DocsAndCost dim : sidewaysDims) {
|
||||
int dimDocID = advanceIfBehind(docID, dim.approximation);
|
||||
if (dimDocID != docID && dim != runaway) {
|
||||
DocsAndCost evicted = runawayDim.insertWithOverflow(dim);
|
||||
if (evicted != null) {
|
||||
// More than one dim has advanced beyond docID, so we jump ahead to the "closer" of
|
||||
// the two:
|
||||
int next = evicted.approximation.docID();
|
||||
docID = baseApproximation.advance(next);
|
||||
continue nextDoc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, we have an "approximate" near-miss or full match, but we still need
|
||||
// to confirm two-phase iterators. First, check the base two-phase (it's always required):
|
||||
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
|
||||
docID = baseApproximation.nextDoc();
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we have two-phase iterators for our sideways dims, check them now. At most, one
|
||||
// sideways dim can miss for the doc to be a near-miss or full match. If more than one misses
|
||||
// we move on:
|
||||
DocsAndCost failedDim = runawayDim.top();
|
||||
if (sidewaysTwoPhaseDims != null) {
|
||||
if (failedDim == null) {
|
||||
for (DocsAndCost dim : twoPhaseDims) {
|
||||
assert dim.approximation.docID() == baseApproximation.docID();
|
||||
// If all sideways dims matched in their approximation phase, then we can allow one
|
||||
// second-phase check to fail:
|
||||
for (DocsAndCost dim : sidewaysTwoPhaseDims) {
|
||||
assert dim.approximation.docID() == docID;
|
||||
if (dim.twoPhase.matches() == false) {
|
||||
if (failedDim != null) {
|
||||
int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
|
||||
docID = baseApproximation.advance(next);
|
||||
// Two second-phase checks have failed, so we move on:
|
||||
docID = baseApproximation.nextDoc();
|
||||
continue nextDoc;
|
||||
} else {
|
||||
failedDim = dim;
|
||||
|
@ -241,14 +304,14 @@ class DrillSidewaysScorer extends BulkScorer {
|
|||
}
|
||||
}
|
||||
} else {
|
||||
for (DocsAndCost dim : twoPhaseDims) {
|
||||
// If a sideways dim failed the approximate check, then no second-phase checks can fail:
|
||||
for (DocsAndCost dim : sidewaysTwoPhaseDims) {
|
||||
if (failedDim == dim) {
|
||||
continue;
|
||||
}
|
||||
assert dim.approximation.docID() == baseApproximation.docID();
|
||||
assert dim.approximation.docID() == docID;
|
||||
if (dim.twoPhase.matches() == false) {
|
||||
int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
|
||||
docID = baseApproximation.advance(next);
|
||||
docID = baseApproximation.nextDoc();
|
||||
continue nextDoc;
|
||||
}
|
||||
}
|
||||
|
@ -258,9 +321,9 @@ class DrillSidewaysScorer extends BulkScorer {
|
|||
collectDocID = docID;
|
||||
if (failedDim == null) {
|
||||
// Hit passed all filters, so it's "real":
|
||||
collectHit(collector, dims);
|
||||
collectHit(collector, sidewaysDims);
|
||||
} else {
|
||||
// Hit missed exactly one filter:
|
||||
// Hit missed exactly one dim:
|
||||
collectNearMiss(failedDim.sidewaysLeafCollector);
|
||||
}
|
||||
|
||||
|
@ -268,6 +331,14 @@ class DrillSidewaysScorer extends BulkScorer {
|
|||
}
|
||||
}
|
||||
|
||||
private static int advanceIfBehind(int docID, DocIdSetIterator iterator) throws IOException {
|
||||
if (iterator.docID() < docID) {
|
||||
return iterator.advance(docID);
|
||||
} else {
|
||||
return iterator.docID();
|
||||
}
|
||||
}
|
||||
|
||||
/** Used when drill downs are highly constraining vs baseQuery. */
|
||||
private void doDrillDownAdvanceScoring(
|
||||
Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
|
||||
|
@ -651,6 +722,28 @@ class DrillSidewaysScorer extends BulkScorer {
|
|||
}
|
||||
}
|
||||
|
||||
private void collectHit(LeafCollector collector, DocsAndCost dim) throws IOException {
|
||||
collector.collect(collectDocID);
|
||||
if (drillDownCollector != null) {
|
||||
drillDownLeafCollector.collect(collectDocID);
|
||||
}
|
||||
|
||||
// Tally sideways count:
|
||||
dim.sidewaysLeafCollector.collect(collectDocID);
|
||||
}
|
||||
|
||||
private void collectHit(LeafCollector collector, List<DocsAndCost> dims) throws IOException {
|
||||
collector.collect(collectDocID);
|
||||
if (drillDownCollector != null) {
|
||||
drillDownLeafCollector.collect(collectDocID);
|
||||
}
|
||||
|
||||
// Tally sideways counts:
|
||||
for (DocsAndCost dim : dims) {
|
||||
dim.sidewaysLeafCollector.collect(collectDocID);
|
||||
}
|
||||
}
|
||||
|
||||
private void collectNearMiss(LeafCollector sidewaysCollector) throws IOException {
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" missingDim=" + dim);
|
||||
|
@ -689,8 +782,10 @@ class DrillSidewaysScorer extends BulkScorer {
|
|||
static class DocsAndCost {
|
||||
// approximation of matching docs, or the scorer itself
|
||||
final DocIdSetIterator approximation;
|
||||
final long cost;
|
||||
// two-phase confirmation, or null if the approximation is accurate
|
||||
final TwoPhaseIterator twoPhase;
|
||||
final float matchCost;
|
||||
final Collector sidewaysCollector;
|
||||
LeafCollector sidewaysLeafCollector;
|
||||
|
||||
|
@ -699,10 +794,13 @@ class DrillSidewaysScorer extends BulkScorer {
|
|||
if (twoPhase == null) {
|
||||
this.approximation = scorer.iterator();
|
||||
this.twoPhase = null;
|
||||
this.matchCost = 0f;
|
||||
} else {
|
||||
this.approximation = twoPhase.approximation();
|
||||
this.twoPhase = twoPhase;
|
||||
this.matchCost = twoPhase.matchCost();
|
||||
}
|
||||
this.cost = approximation.cost();
|
||||
this.sidewaysCollector = sidewaysCollector;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -297,36 +297,26 @@ public class TestDrillSideways extends FacetTestCase {
|
|||
// NRT open
|
||||
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
|
||||
|
||||
// Run all the basic test cases with a standard DrillSideways implementation:
|
||||
DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
|
||||
runDrillSidewaysTestCases(config, ds);
|
||||
|
||||
// Run all the basic test cases but make sure DS is set to score all sub-docs at once, so
|
||||
// we exercise the doc-at-a-time scoring methodology:
|
||||
ds = getNewDrillSidewaysScoreSubdocsAtOnce(searcher, config, taxoReader);
|
||||
runDrillSidewaysTestCases(config, ds);
|
||||
|
||||
writer.close();
|
||||
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
|
||||
}
|
||||
|
||||
private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) throws Exception {
|
||||
// case: drill-down on a single field; in this
|
||||
// case the drill-sideways + drill-down counts ==
|
||||
// drill-down of just the query:
|
||||
DrillDownQuery ddq = new DrillDownQuery(config);
|
||||
ddq.add("Author", "Lisa");
|
||||
DrillSidewaysResult r = ds.search(null, ddq, 10);
|
||||
assertEquals(2, r.hits.totalHits.value);
|
||||
// Publish Date is only drill-down, and Lisa published
|
||||
// one in 2012 and one in 2010:
|
||||
assertEquals(
|
||||
"dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
|
||||
r.facets.getTopChildren(10, "Publish Date").toString());
|
||||
|
||||
// Author is drill-sideways + drill-down: Lisa
|
||||
// (drill-down) published twice, and Frank/Susan/Bob
|
||||
// published once:
|
||||
assertEquals(
|
||||
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
|
||||
r.facets.getTopChildren(10, "Author").toString());
|
||||
|
||||
// Same simple case, but no baseQuery (pure browse):
|
||||
// drill-down on a single field; in this case the
|
||||
// drill-sideways + drill-down counts == drill-down of
|
||||
// just the query:
|
||||
ddq = new DrillDownQuery(config);
|
||||
ddq.add("Author", "Lisa");
|
||||
r = ds.search(null, ddq, 10);
|
||||
|
||||
assertEquals(2, r.hits.totalHits.value);
|
||||
// Publish Date is only drill-down, and Lisa published
|
||||
// one in 2012 and one in 2010:
|
||||
|
@ -484,9 +474,6 @@ public class TestDrillSideways extends FacetTestCase {
|
|||
() -> {
|
||||
finalR.facets.getTopChildren(0, "Author");
|
||||
});
|
||||
|
||||
writer.close();
|
||||
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
|
||||
}
|
||||
|
||||
public void testBasicWithCollectorManager() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue