Ensure DrillSidewaysScorer calls LeafCollector#finish on all sideways-dim FacetsCollectors (#12640)

This commit is contained in:
Greg Miller 2023-11-13 16:35:53 -08:00 committed by GitHub
parent 1999353d9e
commit 117e8d3435
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 106 additions and 26 deletions

View File

@ -180,7 +180,7 @@ New Features
* GITHUB#12582: Add int8 scalar quantization to the HNSW vector format. This optionally allows for more compact lossy
storage for the vectors, requiring about 75% memory for fast HNSW search. (Ben Trent)
* GITHUB#12660: HNSW graph now can be merged with multiple threads. Configurable in Lucene99HnswVectorsFormat.
(Patrick Zhai)
@ -311,13 +311,16 @@ Bug Fixes
* GITHUB#12770: Stop exploring HNSW graph if scores are not getting better. (Ben Trent)
* GITHUB#12640: Ensure #finish is called on all drill-sideways collectors even if one throws a
CollectionTerminatedException (Greg Miller)
Build
---------------------
* GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in
nightly mode). (Dawid Weiss)
* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed
leading to odd runtime errors (Chris Hostetter, Dawid Weiss)
* GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler)

View File

@ -18,7 +18,6 @@ package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
@ -145,22 +144,30 @@ class DrillSidewaysScorer extends BulkScorer {
}
*/
if (scoreSubDocsAtOnce || baseQueryCost < drillDownCost / 10) {
// System.out.println("queryFirst: baseScorer=" + baseScorer + " disis.length=" + disis.length
// + " bits.length=" + bits.length);
// position base scorer to the first matching doc
baseApproximation.nextDoc();
doQueryFirstScoring(acceptDocs, collector, dims);
} else if (numDims > 1 && drillDownAdvancedCost < baseQueryCost / 10) {
// System.out.println("drillDownAdvance");
// position base scorer to the first matching doc
baseIterator.nextDoc();
doDrillDownAdvanceScoring(acceptDocs, collector, dims);
} else {
// System.out.println("union");
// position base scorer to the first matching doc
baseIterator.nextDoc();
doUnionScoring(acceptDocs, collector, dims);
try {
if (scoreSubDocsAtOnce || baseQueryCost < drillDownCost / 10) {
// System.out.println("queryFirst: baseScorer=" + baseScorer + " disis.length=" +
// disis.length
// + " bits.length=" + bits.length);
// position base scorer to the first matching doc
baseApproximation.nextDoc();
doQueryFirstScoring(acceptDocs, collector, dims);
} else if (numDims > 1 && drillDownAdvancedCost < baseQueryCost / 10) {
// System.out.println("drillDownAdvance");
// position base scorer to the first matching doc
baseIterator.nextDoc();
doDrillDownAdvanceScoring(acceptDocs, collector, dims);
} else {
// System.out.println("union");
// position base scorer to the first matching doc
baseIterator.nextDoc();
doUnionScoring(acceptDocs, collector, dims);
}
} finally {
// TODO: What's the right behavior when a collector throws CollectionTerminatedException?
// Should we stop scoring immediately (what we're doing now), or should we keep scoring until
// all collectors throw? Should users be able to specify somehow?
finish(dims);
}
return Integer.MAX_VALUE;
@ -199,7 +206,6 @@ class DrillSidewaysScorer extends BulkScorer {
docID = baseApproximation.nextDoc();
}
finish(Collections.singleton(dim));
}
/**
@ -337,8 +343,6 @@ class DrillSidewaysScorer extends BulkScorer {
docID = baseApproximation.nextDoc();
}
finish(sidewaysDims);
}
private static int advanceIfBehind(int docID, DocIdSetIterator iterator) throws IOException {
@ -557,7 +561,6 @@ class DrillSidewaysScorer extends BulkScorer {
nextChunkStart += CHUNK;
}
finish(Arrays.asList(dims));
}
private void doUnionScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
@ -712,8 +715,6 @@ class DrillSidewaysScorer extends BulkScorer {
nextChunkStart += CHUNK;
}
finish(Arrays.asList(dims));
}
private void collectHit(LeafCollector collector, DocsAndCost[] dims) throws IOException {
@ -765,7 +766,7 @@ class DrillSidewaysScorer extends BulkScorer {
sidewaysCollector.collect(collectDocID);
}
private void finish(Collection<DocsAndCost> dims) throws IOException {
private void finish(DocsAndCost[] dims) throws IOException {
// Note: We _only_ call #finish on the facets collectors we're managing here, but not the
// "main" collector. This is because IndexSearcher handles calling #finish on the main
// collector.

View File

@ -55,6 +55,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.ConstantScoreScorer;
@ -257,6 +258,81 @@ public class TestDrillSideways extends FacetTestCase {
IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
/**
 * Verifies that sideways-dim FacetsCollectors are still correctly populated when the main
 * collector terminates early with {@link CollectionTerminatedException} — i.e., that
 * DrillSidewaysScorer calls {@code LeafCollector#finish} even on the early-termination path
 * (GITHUB#12640).
 */
public void testCollectionTerminated() throws Exception {
  try (Directory dir = newDirectory();
      RandomIndexWriter w = new RandomIndexWriter(random(), dir)) {
    // Index two docs matching the sideways dim (foo:bar) and one that doesn't (foo:baz):
    Document d = new Document();
    d.add(new TextField("foo", "bar", Field.Store.NO));
    w.addDocument(d);

    d = new Document();
    d.add(new TextField("foo", "bar", Field.Store.NO));
    w.addDocument(d);

    d = new Document();
    d.add(new TextField("foo", "baz", Field.Store.NO));
    w.addDocument(d);

    try (IndexReader r = w.getReader()) {
      IndexSearcher searcher = new IndexSearcher(r);
      LeafReaderContext ctx = r.leaves().get(0);

      // Base query matches all docs; the dim query drills sideways on foo:bar:
      Query baseQuery = new MatchAllDocsQuery();
      Weight baseWeight = searcher.createWeight(baseQuery, ScoreMode.COMPLETE_NO_SCORES, 1f);
      Scorer baseScorer = baseWeight.scorer(ctx);

      Query dimQ = new TermQuery(new Term("foo", "bar"));
      dimQ = searcher.rewrite(dimQ);
      Weight dimWeight = searcher.createWeight(dimQ, ScoreMode.COMPLETE_NO_SCORES, 1f);
      Scorer dimScorer = dimWeight.scorer(ctx);

      FacetsCollector baseFC = new FacetsCollector();
      FacetsCollector dimFC = new FacetsCollector();
      DrillSidewaysScorer.DocsAndCost docsAndCost =
          new DrillSidewaysScorer.DocsAndCost(dimScorer, dimFC);

      // Main collector that throws CollectionTerminatedException after the first collected doc:
      LeafCollector baseCollector =
          new LeafCollector() {
            int counter = 0;

            @Override
            public void setScorer(Scorable scorer) throws IOException {
              // doesn't matter
            }

            @Override
            public void collect(int doc) throws IOException {
              if (++counter > 1) {
                throw new CollectionTerminatedException();
              }
            }
          };

      // Randomize the scoring strategy so both code paths get exercised across runs:
      boolean scoreSubDocsAtOnce = random().nextBoolean();
      DrillSidewaysScorer scorer =
          new DrillSidewaysScorer(
              ctx,
              baseScorer,
              baseFC,
              new DrillSidewaysScorer.DocsAndCost[] {docsAndCost},
              scoreSubDocsAtOnce);
      expectThrows(CollectionTerminatedException.class, () -> scorer.score(baseCollector, null));

      // We've set things up so that our base collector will throw CollectionTerminatedException
      // after collecting the first doc. This means we'll only collect the first indexed doc for
      // both our base and sideways dim facets collectors. What we really want to test here is
      // that the matching docs are still correctly present and populated after an early
      // termination occurs (i.e., #finish is properly called in that scenario):
      assertEquals(1, baseFC.getMatchingDocs().size());
      assertEquals(1, dimFC.getMatchingDocs().size());
      FacetsCollector.MatchingDocs baseMD = baseFC.getMatchingDocs().get(0);
      FacetsCollector.MatchingDocs dimMD = dimFC.getMatchingDocs().get(0);
      assertEquals(1, baseMD.totalHits);
      assertEquals(1, dimMD.totalHits);
    }
  }
}
public void testBasic() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();