Ensure DrillSidewaysScorer calls LeafCollector#finish on all sideways-dim FacetsCollectors (#12640)

This commit is contained in:
Greg Miller 2023-11-13 16:35:53 -08:00 committed by GitHub
parent 1999353d9e
commit 117e8d3435
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 106 additions and 26 deletions

View File

@ -180,7 +180,7 @@ New Features
* GITHUB#12582: Add int8 scalar quantization to the HNSW vector format. This optionally allows for more compact lossy
storage for the vectors, requiring about 75% memory for fast HNSW search. (Ben Trent)
* GITHUB#12660: HNSW graph now can be merged with multiple threads. Configurable in Lucene99HnswVectorsFormat.
(Patrick Zhai)
@ -311,13 +311,16 @@ Bug Fixes
* GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent)
* GITHUB#12640: Ensure #finish is called on all drill-sideways collectors even if one throws a
CollectionTerminatedException (Greg Miller)
Build
---------------------
* GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in
nightly mode). (Dawid Weiss)
* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed
leading to odd runtime errors (Chris Hostetter, Dawid Weiss)
* GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler)

View File

@ -18,7 +18,6 @@ package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
@ -145,22 +144,30 @@ class DrillSidewaysScorer extends BulkScorer {
}
*/
if (scoreSubDocsAtOnce || baseQueryCost < drillDownCost / 10) {
// System.out.println("queryFirst: baseScorer=" + baseScorer + " disis.length=" + disis.length
// + " bits.length=" + bits.length);
// position base scorer to the first matching doc
baseApproximation.nextDoc();
doQueryFirstScoring(acceptDocs, collector, dims);
} else if (numDims > 1 && drillDownAdvancedCost < baseQueryCost / 10) {
// System.out.println("drillDownAdvance");
// position base scorer to the first matching doc
baseIterator.nextDoc();
doDrillDownAdvanceScoring(acceptDocs, collector, dims);
} else {
// System.out.println("union");
// position base scorer to the first matching doc
baseIterator.nextDoc();
doUnionScoring(acceptDocs, collector, dims);
try {
if (scoreSubDocsAtOnce || baseQueryCost < drillDownCost / 10) {
// System.out.println("queryFirst: baseScorer=" + baseScorer + " disis.length=" +
// disis.length
// + " bits.length=" + bits.length);
// position base scorer to the first matching doc
baseApproximation.nextDoc();
doQueryFirstScoring(acceptDocs, collector, dims);
} else if (numDims > 1 && drillDownAdvancedCost < baseQueryCost / 10) {
// System.out.println("drillDownAdvance");
// position base scorer to the first matching doc
baseIterator.nextDoc();
doDrillDownAdvanceScoring(acceptDocs, collector, dims);
} else {
// System.out.println("union");
// position base scorer to the first matching doc
baseIterator.nextDoc();
doUnionScoring(acceptDocs, collector, dims);
}
} finally {
// TODO: What's the right behavior when a collector throws CollectionTerminatedException?
// Should we stop scoring immediately (what we're doing now), or should we keep scoring until
// all collectors throw? Should users be able to specify somehow?
finish(dims);
}
return Integer.MAX_VALUE;
@ -199,7 +206,6 @@ class DrillSidewaysScorer extends BulkScorer {
docID = baseApproximation.nextDoc();
}
finish(Collections.singleton(dim));
}
/**
@ -337,8 +343,6 @@ class DrillSidewaysScorer extends BulkScorer {
docID = baseApproximation.nextDoc();
}
finish(sidewaysDims);
}
private static int advanceIfBehind(int docID, DocIdSetIterator iterator) throws IOException {
@ -557,7 +561,6 @@ class DrillSidewaysScorer extends BulkScorer {
nextChunkStart += CHUNK;
}
finish(Arrays.asList(dims));
}
private void doUnionScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
@ -712,8 +715,6 @@ class DrillSidewaysScorer extends BulkScorer {
nextChunkStart += CHUNK;
}
finish(Arrays.asList(dims));
}
private void collectHit(LeafCollector collector, DocsAndCost[] dims) throws IOException {
@ -765,7 +766,7 @@ class DrillSidewaysScorer extends BulkScorer {
sidewaysCollector.collect(collectDocID);
}
private void finish(Collection<DocsAndCost> dims) throws IOException {
private void finish(DocsAndCost[] dims) throws IOException {
// Note: We _only_ call #finish on the facets collectors we're managing here, but not the
// "main" collector. This is because IndexSearcher handles calling #finish on the main
// collector.

View File

@ -55,6 +55,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.ConstantScoreScorer;
@ -257,6 +258,81 @@ public class TestDrillSideways extends FacetTestCase {
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
/**
 * Verifies that sideways-dim FacetsCollectors are still correctly populated when the main
 * collector terminates early with {@link CollectionTerminatedException} — i.e., that
 * DrillSidewaysScorer calls {@code LeafCollector#finish} even on the early-termination path
 * (GITHUB#12640).
 */
public void testCollectionTerminated() throws Exception {
  try (Directory dir = newDirectory();
      RandomIndexWriter w = new RandomIndexWriter(random(), dir)) {
    // Index two docs matching the sideways dim (foo:bar) and one that doesn't (foo:baz):
    Document d = new Document();
    d.add(new TextField("foo", "bar", Field.Store.NO));
    w.addDocument(d);

    d = new Document();
    d.add(new TextField("foo", "bar", Field.Store.NO));
    w.addDocument(d);

    d = new Document();
    d.add(new TextField("foo", "baz", Field.Store.NO));
    w.addDocument(d);

    try (IndexReader r = w.getReader()) {
      IndexSearcher searcher = new IndexSearcher(r);
      LeafReaderContext ctx = r.leaves().get(0);

      // Base query matches all docs; the dim query drills sideways on foo:bar:
      Query baseQuery = new MatchAllDocsQuery();
      Weight baseWeight = searcher.createWeight(baseQuery, ScoreMode.COMPLETE_NO_SCORES, 1f);
      Scorer baseScorer = baseWeight.scorer(ctx);

      Query dimQ = new TermQuery(new Term("foo", "bar"));
      dimQ = searcher.rewrite(dimQ);
      Weight dimWeight = searcher.createWeight(dimQ, ScoreMode.COMPLETE_NO_SCORES, 1f);
      Scorer dimScorer = dimWeight.scorer(ctx);

      FacetsCollector baseFC = new FacetsCollector();
      FacetsCollector dimFC = new FacetsCollector();
      DrillSidewaysScorer.DocsAndCost docsAndCost =
          new DrillSidewaysScorer.DocsAndCost(dimScorer, dimFC);

      // Main collector that throws CollectionTerminatedException after the first collected doc:
      LeafCollector baseCollector =
          new LeafCollector() {
            int counter = 0;

            @Override
            public void setScorer(Scorable scorer) throws IOException {
              // doesn't matter
            }

            @Override
            public void collect(int doc) throws IOException {
              if (++counter > 1) {
                throw new CollectionTerminatedException();
              }
            }
          };

      // Randomize the scoring strategy so both code paths get exercised across runs:
      boolean scoreSubDocsAtOnce = random().nextBoolean();
      DrillSidewaysScorer scorer =
          new DrillSidewaysScorer(
              ctx,
              baseScorer,
              baseFC,
              new DrillSidewaysScorer.DocsAndCost[] {docsAndCost},
              scoreSubDocsAtOnce);
      expectThrows(CollectionTerminatedException.class, () -> scorer.score(baseCollector, null));

      // We've set things up so that our base collector will throw CollectionTerminatedException
      // after collecting the first doc. This means we'll only collect the first indexed doc for
      // both our base and sideways dim facets collectors. What we really want to test here is
      // that the matching docs are still correctly present and populated after an early
      // termination occurs (i.e., #finish is properly called in that scenario):
      assertEquals(1, baseFC.getMatchingDocs().size());
      assertEquals(1, dimFC.getMatchingDocs().size());
      FacetsCollector.MatchingDocs baseMD = baseFC.getMatchingDocs().get(0);
      FacetsCollector.MatchingDocs dimMD = dimFC.getMatchingDocs().get(0);
      assertEquals(1, baseMD.totalHits);
      assertEquals(1, dimMD.totalHits);
    }
  }
}
public void testBasic() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();