mirror of https://github.com/apache/lucene.git
DrillSideways optimizations (#11803)
DrillSidewaysScorer now separates first-phase and second-phase matching to delay match confirmation, and uses advance instead of nextDoc where possible.
This commit is contained in:
parent
6f25c79db3
commit
d02ba3134f
|
@ -129,6 +129,9 @@ Optimizations
|
||||||
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
|
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
|
||||||
(Luke Kot-Zaniewski)
|
(Luke Kot-Zaniewski)
|
||||||
|
|
||||||
|
* GITHUB#11797: DrillSidewaysScorer has been improved to leverage "advance" instead of "next" where
|
||||||
|
possible, and splits out first- and second-phase checks to delay match confirmation. (Greg Miller)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
---------------------
|
---------------------
|
||||||
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
|
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
|
||||||
|
|
|
@ -17,8 +17,12 @@
|
||||||
package org.apache.lucene.facet;
|
package org.apache.lucene.facet;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
import org.apache.lucene.search.BulkScorer;
|
import org.apache.lucene.search.BulkScorer;
|
||||||
|
@ -30,10 +34,22 @@ import org.apache.lucene.search.ScoreCachingWrappingScorer;
|
||||||
import org.apache.lucene.search.Scorer;
|
import org.apache.lucene.search.Scorer;
|
||||||
import org.apache.lucene.search.TwoPhaseIterator;
|
import org.apache.lucene.search.TwoPhaseIterator;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.CollectionUtil;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
|
|
||||||
class DrillSidewaysScorer extends BulkScorer {
|
class DrillSidewaysScorer extends BulkScorer {
|
||||||
|
|
||||||
|
private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
|
||||||
|
Comparator.comparingLong(e -> e.approximation.cost());
|
||||||
|
|
||||||
|
private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
|
||||||
|
new Comparator<DocsAndCost>() {
|
||||||
|
@Override
|
||||||
|
public int compare(DocsAndCost o1, DocsAndCost o2) {
|
||||||
|
return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// private static boolean DEBUG = false;
|
// private static boolean DEBUG = false;
|
||||||
|
|
||||||
private final Collector drillDownCollector;
|
private final Collector drillDownCollector;
|
||||||
|
@ -44,6 +60,8 @@ class DrillSidewaysScorer extends BulkScorer {
|
||||||
// DrillDown DocsEnums:
|
// DrillDown DocsEnums:
|
||||||
private final Scorer baseScorer;
|
private final Scorer baseScorer;
|
||||||
private final DocIdSetIterator baseIterator;
|
private final DocIdSetIterator baseIterator;
|
||||||
|
private final DocIdSetIterator baseApproximation;
|
||||||
|
private final TwoPhaseIterator baseTwoPhase;
|
||||||
|
|
||||||
private final LeafReaderContext context;
|
private final LeafReaderContext context;
|
||||||
|
|
||||||
|
@ -65,6 +83,12 @@ class DrillSidewaysScorer extends BulkScorer {
|
||||||
this.context = context;
|
this.context = context;
|
||||||
this.baseScorer = baseScorer;
|
this.baseScorer = baseScorer;
|
||||||
this.baseIterator = baseScorer.iterator();
|
this.baseIterator = baseScorer.iterator();
|
||||||
|
this.baseTwoPhase = baseScorer.twoPhaseIterator();
|
||||||
|
if (baseTwoPhase != null) {
|
||||||
|
this.baseApproximation = baseTwoPhase.approximation();
|
||||||
|
} else {
|
||||||
|
this.baseApproximation = baseIterator;
|
||||||
|
}
|
||||||
this.drillDownCollector = drillDownCollector;
|
this.drillDownCollector = drillDownCollector;
|
||||||
this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
|
this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
|
||||||
}
|
}
|
||||||
|
@ -149,60 +173,98 @@ class DrillSidewaysScorer extends BulkScorer {
|
||||||
*/
|
*/
|
||||||
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
|
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
// if (DEBUG) {
|
|
||||||
// System.out.println(" doQueryFirstScoring");
|
|
||||||
// }
|
|
||||||
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
|
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
|
||||||
|
|
||||||
int docID = baseScorer.docID();
|
List<DocsAndCost> allDims = Arrays.asList(dims);
|
||||||
|
CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
|
||||||
|
|
||||||
|
List<DocsAndCost> twoPhaseDims = null;
|
||||||
|
for (DocsAndCost dim : dims) {
|
||||||
|
if (dim.twoPhase != null) {
|
||||||
|
if (twoPhaseDims == null) {
|
||||||
|
twoPhaseDims = new ArrayList<>(dims.length);
|
||||||
|
}
|
||||||
|
twoPhaseDims.add(dim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (twoPhaseDims != null) {
|
||||||
|
CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
|
||||||
|
}
|
||||||
|
|
||||||
|
int docID = baseApproximation.docID();
|
||||||
|
|
||||||
nextDoc:
|
nextDoc:
|
||||||
while (docID != PostingsEnum.NO_MORE_DOCS) {
|
while (docID != PostingsEnum.NO_MORE_DOCS) {
|
||||||
|
assert docID == baseApproximation.docID();
|
||||||
|
|
||||||
if (acceptDocs != null && acceptDocs.get(docID) == false) {
|
if (acceptDocs != null && acceptDocs.get(docID) == false) {
|
||||||
docID = baseIterator.nextDoc();
|
docID = baseApproximation.nextDoc();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
LeafCollector failedCollector = null;
|
|
||||||
for (DocsAndCost dim : dims) {
|
DocsAndCost failedDim = null;
|
||||||
// TODO: should we sort this 2nd dimension of
|
for (DocsAndCost dim : allDims) {
|
||||||
// docsEnums from most frequent to least?
|
final int dimDocID;
|
||||||
if (dim.approximation.docID() < docID) {
|
if (dim.approximation.docID() < docID) {
|
||||||
dim.approximation.advance(docID);
|
dimDocID = dim.approximation.advance(docID);
|
||||||
|
} else {
|
||||||
|
dimDocID = dim.approximation.docID();
|
||||||
}
|
}
|
||||||
|
if (dimDocID != docID) {
|
||||||
boolean matches = false;
|
if (failedDim != null) {
|
||||||
if (dim.approximation.docID() == docID) {
|
int next = Math.min(dimDocID, failedDim.approximation.docID());
|
||||||
if (dim.twoPhase == null) {
|
docID = baseApproximation.advance(next);
|
||||||
matches = true;
|
|
||||||
} else {
|
|
||||||
matches = dim.twoPhase.matches();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (matches == false) {
|
|
||||||
if (failedCollector != null) {
|
|
||||||
// More than one dim fails on this document, so
|
|
||||||
// it's neither a hit nor a near-miss; move to
|
|
||||||
// next doc:
|
|
||||||
docID = baseIterator.nextDoc();
|
|
||||||
continue nextDoc;
|
continue nextDoc;
|
||||||
} else {
|
} else {
|
||||||
failedCollector = dim.sidewaysLeafCollector;
|
failedDim = dim;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
|
||||||
|
docID = baseApproximation.nextDoc();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (twoPhaseDims != null) {
|
||||||
|
if (failedDim == null) {
|
||||||
|
for (DocsAndCost dim : twoPhaseDims) {
|
||||||
|
assert dim.approximation.docID() == baseApproximation.docID();
|
||||||
|
if (dim.twoPhase.matches() == false) {
|
||||||
|
if (failedDim != null) {
|
||||||
|
int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
|
||||||
|
docID = baseApproximation.advance(next);
|
||||||
|
continue nextDoc;
|
||||||
|
} else {
|
||||||
|
failedDim = dim;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (DocsAndCost dim : twoPhaseDims) {
|
||||||
|
if (failedDim == dim) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
assert dim.approximation.docID() == baseApproximation.docID();
|
||||||
|
if (dim.twoPhase.matches() == false) {
|
||||||
|
int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
|
||||||
|
docID = baseApproximation.advance(next);
|
||||||
|
continue nextDoc;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
collectDocID = docID;
|
collectDocID = docID;
|
||||||
|
if (failedDim == null) {
|
||||||
if (failedCollector == null) {
|
|
||||||
// Hit passed all filters, so it's "real":
|
// Hit passed all filters, so it's "real":
|
||||||
collectHit(collector, dims);
|
collectHit(collector, dims);
|
||||||
} else {
|
} else {
|
||||||
// Hit missed exactly one filter:
|
// Hit missed exactly one filter:
|
||||||
collectNearMiss(failedCollector);
|
collectNearMiss(failedDim.sidewaysLeafCollector);
|
||||||
}
|
}
|
||||||
|
|
||||||
docID = baseIterator.nextDoc();
|
docID = baseApproximation.nextDoc();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,6 +33,7 @@ import java.util.stream.Collectors;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.SortedDocValuesField;
|
import org.apache.lucene.document.SortedDocValuesField;
|
||||||
|
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||||
import org.apache.lucene.document.StringField;
|
import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
|
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
|
||||||
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
|
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
|
||||||
|
@ -1051,6 +1052,7 @@ public class TestDrillSideways extends FacetTestCase {
|
||||||
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
|
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
|
||||||
}
|
}
|
||||||
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
|
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
|
||||||
|
doc.add(new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue])));
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
|
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
|
||||||
}
|
}
|
||||||
|
@ -1063,6 +1065,8 @@ public class TestDrillSideways extends FacetTestCase {
|
||||||
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
|
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
|
||||||
}
|
}
|
||||||
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
|
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
|
||||||
|
doc.add(
|
||||||
|
new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue2])));
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
|
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
|
||||||
}
|
}
|
||||||
|
@ -1188,7 +1192,15 @@ public class TestDrillSideways extends FacetTestCase {
|
||||||
for (int dim = 0; dim < numDims; dim++) {
|
for (int dim = 0; dim < numDims; dim++) {
|
||||||
if (drillDowns[dim] != null) {
|
if (drillDowns[dim] != null) {
|
||||||
for (String value : drillDowns[dim]) {
|
for (String value : drillDowns[dim]) {
|
||||||
ddq.add("dim" + dim, value);
|
// Sometimes use a "traditional" term query and sometimes use a two-phase approach to
|
||||||
|
// ensure code coverage:
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
ddq.add("dim" + dim, value);
|
||||||
|
} else {
|
||||||
|
ddq.add(
|
||||||
|
"dim" + dim,
|
||||||
|
SortedSetDocValuesField.newSlowExactQuery("dim" + dim, new BytesRef(value)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue