LUCENE-7707: add explicit boolean to TopDocs.merge to govern whether incoming or implicit shard index should be used

Mike McCandless 2017-02-24 17:00:45 -05:00
parent 471d842742
commit 2e56c0e505
5 changed files with 62 additions and 64 deletions

lucene/CHANGES.txt

@@ -97,6 +97,12 @@ API Changes
* LUCENE-7702: Removed GraphQuery in favour of simple boolean query. (Matt Webber via Jim Ferenczi)
* LUCENE-7707: TopDocs.merge now takes a boolean option telling it
when to use the incoming shard index versus when to assign the shard
index itself, allowing users to merge shard responses incrementally
instead of once all shard responses are present. (Simon Willnauer,
Mike McCandless)
New Features
* LUCENE-7449: Add CROSSES relation support to RangeFieldQuery. (Nick Knize)
@@ -172,10 +178,6 @@ Improvements
earlier than regular queries in order to improve cache efficiency.
(Adrien Grand)
* LUCENE-7707: Use predefined shard index when mergeing top docs if present. This
allows to use TopDoc#merge to merge shard responses incrementally instead of
once all shard responses are present. (Simon Willnauer)
Optimizations
* LUCENE-7641: Optimized point range queries to compute documents that do not
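To illustrate the LUCENE-7707 entry above, here is a minimal sketch of the updated TopDocs.merge with setShardIndex=true, which keeps the pre-existing behavior of assigning shard indices from array position. The doc IDs and scores are invented for the sketch; only the merge signature and the ScoreDoc/TopDocs constructors come from this patch.

import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public class MergeSetShardIndexSketch {
  public static void main(String[] args) {
    // Two per-shard top-hits lists; each shard's docs are already sorted by score.
    TopDocs shard0 = new TopDocs(2, new ScoreDoc[] { new ScoreDoc(0, 3.0f), new ScoreDoc(5, 1.0f) });
    TopDocs shard1 = new TopDocs(1, new ScoreDoc[] { new ScoreDoc(2, 2.0f) });

    // setShardIndex=true: merge fills ScoreDoc.shardIndex from each TopDocs'
    // position in the array (0 and 1 here), i.e. the behavior callers got before.
    TopDocs merged = TopDocs.merge(0, 3, new TopDocs[] { shard0, shard1 }, true);
    for (ScoreDoc hit : merged.scoreDocs) {
      System.out.println("doc=" + hit.doc + " score=" + hit.score + " shard=" + hit.shardIndex);
    }
  }
}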

lucene/core/src/java/org/apache/lucene/search/FieldDoc.java

@@ -52,18 +52,18 @@ public class FieldDoc extends ScoreDoc {
/** Expert: Creates one of these objects with empty sort information. */
public FieldDoc(int doc, float score) {
super (doc, score);
super(doc, score);
}
/** Expert: Creates one of these objects with the given sort information. */
public FieldDoc(int doc, float score, Object[] fields) {
super (doc, score);
super(doc, score);
this.fields = fields;
}
/** Expert: Creates one of these objects with the given sort information. */
public FieldDoc(int doc, float score, Object[] fields, int shardIndex) {
super (doc, score, shardIndex);
super(doc, score, shardIndex);
this.fields = fields;
}

lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java

@@ -432,7 +432,7 @@ public class IndexSearcher {
for (TopScoreDocCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(cappedNumHits, topDocs);
return TopDocs.merge(0, cappedNumHits, topDocs, true);
}
};
@@ -559,7 +559,7 @@ public class IndexSearcher {
for (TopFieldCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(sort, cappedNumHits, topDocs);
return TopDocs.merge(sort, 0, cappedNumHits, topDocs, true);
}
};
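Both IndexSearcher call sites above pass true, so per-slice results keep getting their shardIndex assigned from array position, exactly as before this change. Below is a hedged sketch of the user-facing path that reaches this reduce; the index directory "/path/to/index" and the "body"/"lucene" field and term are placeholders, not part of the patch.

import java.nio.file.Paths;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class ExecutorSearchSketch {
  public static void main(String[] args) throws Exception {
    ExecutorService executor = Executors.newFixedThreadPool(4);
    // "/path/to/index" is a placeholder path; any existing index works.
    try (FSDirectory dir = FSDirectory.open(Paths.get("/path/to/index"));
         DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader, executor);
      // Each leaf slice is collected on the executor; the per-slice TopDocs are
      // then reduced through the TopDocs.merge(..., true) call shown above.
      TopDocs hits = searcher.search(new TermQuery(new Term("body", "lucene")), 10);
      System.out.println("totalHits=" + hits.totalHits);
    } finally {
      executor.shutdown();
    }
  }
}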

lucene/core/src/java/org/apache/lucene/search/TopDocs.java

@@ -16,7 +16,6 @@
*/
package org.apache.lucene.search;
import org.apache.lucene.util.PriorityQueue;
/** Represents hits returned by {@link
@@ -60,6 +59,8 @@ public class TopDocs {
private final static class ShardRef {
// Which shard (index into shardHits[]):
final int shardIndex;
// True if we should use the incoming ScoreDoc.shardIndex for sort order
final boolean useScoreDocIndex;
// Which hit within the shard:
@@ -77,10 +78,12 @@ public class TopDocs {
int getShardIndex(ScoreDoc scoreDoc) {
if (useScoreDocIndex) {
assert scoreDoc.shardIndex != -1 : "scoreDoc shardIndex must be predefined set but wasn't";
if (scoreDoc.shardIndex == -1) {
throw new IllegalArgumentException("setShardIndex is false but TopDocs[" + shardIndex + "].scoreDocs[" + hitIndex + "] is not set");
}
return scoreDoc.shardIndex;
} else {
assert scoreDoc.shardIndex == -1 : "scoreDoc shardIndex must be undefined but wasn't";
// NOTE: we don't assert that shardIndex is -1 here, because caller could in fact have set it but asked us to ignore it now
return shardIndex;
}
}
@@ -201,23 +204,25 @@ public class TopDocs {
* the provided TopDocs, sorting by score. Each {@link TopDocs}
* instance must be sorted.
*
* @see #merge(int, int, TopDocs[])
* @see #merge(int, int, TopDocs[], boolean)
* @lucene.experimental */
public static TopDocs merge(int topN, TopDocs[] shardHits) {
return merge(0, topN, shardHits);
return merge(0, topN, shardHits, true);
}
/**
* Same as {@link #merge(int, TopDocs[])} but also ignores the top
* {@code start} top docs. This is typically useful for pagination.
*
* Note: This method will fill the {@link ScoreDoc#shardIndex} on all score docs returned iff all ScoreDocs passed
* to this have it's shard index set to <tt>-1</tt>. Otherwise the shard index is not set. This allows to predefine
* the shard index in order to incrementally merge shard responses without losing the original shard index.
* Note: If {@code setShardIndex} is true, this method will assume the incoming order of {@code shardHits} reflects
* each shard's index and will fill the {@link ScoreDoc#shardIndex}, otherwise
* it must already be set for all incoming {@code ScoreDoc}s, which can be useful when doing multiple reductions
* (merges) of TopDocs.
*
* @lucene.experimental
*/
public static TopDocs merge(int start, int topN, TopDocs[] shardHits) {
return mergeAux(null, start, topN, shardHits);
public static TopDocs merge(int start, int topN, TopDocs[] shardHits, boolean setShardIndex) {
return mergeAux(null, start, topN, shardHits, setShardIndex);
}
/** Returns a new TopFieldDocs, containing topN results across
@@ -226,31 +231,34 @@ public class TopDocs {
* the same Sort, and sort field values must have been
* filled (ie, <code>fillFields=true</code> must be
* passed to {@link TopFieldCollector#create}).
* @see #merge(Sort, int, int, TopFieldDocs[])
* @see #merge(Sort, int, int, TopFieldDocs[], boolean)
* @lucene.experimental */
public static TopFieldDocs merge(Sort sort, int topN, TopFieldDocs[] shardHits) {
return merge(sort, 0, topN, shardHits);
return merge(sort, 0, topN, shardHits, true);
}
/**
* Same as {@link #merge(Sort, int, TopFieldDocs[])} but also ignores the top
* {@code start} top docs. This is typically useful for pagination.
*
* Note: This method will fill the {@link ScoreDoc#shardIndex} on all score docs returned iff all ScoreDocs passed
* to this have it's shard index set to <tt>-1</tt>. Otherwise the shard index is not set. This allows to predefine
* the shard index in order to incrementally merge shard responses without losing the original shard index.
* Note: If {@code setShardIndex} is true, this method will assume the incoming order of {@code shardHits} reflects
* each shard's index and will fill the {@link ScoreDoc#shardIndex}, otherwise
* it must already be set for all incoming {@code ScoreDoc}s, which can be useful when doing multiple reductions
* (merges) of TopDocs.
*
* @lucene.experimental
*/
public static TopFieldDocs merge(Sort sort, int start, int topN, TopFieldDocs[] shardHits) {
public static TopFieldDocs merge(Sort sort, int start, int topN, TopFieldDocs[] shardHits, boolean setShardIndex) {
if (sort == null) {
throw new IllegalArgumentException("sort must be non-null when merging field-docs");
}
return (TopFieldDocs) mergeAux(sort, start, topN, shardHits);
return (TopFieldDocs) mergeAux(sort, start, topN, shardHits, setShardIndex);
}
/** Auxiliary method used by the {@link #merge} impls. A sort value of null
* is used to indicate that docs should be sorted by score. */
private static TopDocs mergeAux(Sort sort, int start, int size, TopDocs[] shardHits) {
private static TopDocs mergeAux(Sort sort, int start, int size, TopDocs[] shardHits, boolean setShardIndex) {
final PriorityQueue<ShardRef> queue;
if (sort == null) {
queue = new ScoreMergeSortQueue(shardHits);
@@ -261,28 +269,15 @@ public class TopDocs {
int totalHitCount = 0;
int availHitCount = 0;
float maxScore = Float.MIN_VALUE;
Boolean setShardIndex = null;
for(int shardIDX=0;shardIDX<shardHits.length;shardIDX++) {
final TopDocs shard = shardHits[shardIDX];
// totalHits can be non-zero even if no hits were
// collected, when searchAfter was used:
totalHitCount += shard.totalHits;
if (shard.scoreDocs != null && shard.scoreDocs.length > 0) {
if (shard.scoreDocs[0].shardIndex == -1) {
if (setShardIndex != null && setShardIndex == false) {
throw new IllegalStateException("scoreDocs at index " + shardIDX + " has undefined shard indices but previous scoreDocs were predefined");
}
setShardIndex = true;
} else {
if (setShardIndex != null && setShardIndex) {
throw new IllegalStateException("scoreDocs at index " + shardIDX + " has predefined shard indices but previous scoreDocs were undefined");
}
setShardIndex = false;
}
availHitCount += shard.scoreDocs.length;
queue.add(new ShardRef(shardIDX, setShardIndex == false));
maxScore = Math.max(maxScore, shard.getMaxScore());
//System.out.println(" maxScore now " + maxScore + " vs " + shard.getMaxScore());
}
}
@@ -303,19 +298,16 @@ public class TopDocs {
ShardRef ref = queue.top();
final ScoreDoc hit = shardHits[ref.shardIndex].scoreDocs[ref.hitIndex++];
if (setShardIndex) {
// unless this index is already initialized potentially due to multiple merge phases, or explicitly by the user
// we set the shard index to the index of the TopDocs array this hit is coming from.
// this allows multiple merge phases if needed but requires extra accounting on the users end.
// at the same time this is fully backwards compatible since the value was initialize to -1 from the beginning
// caller asked us to record shardIndex (index of the TopDocs array) this hit is coming from:
hit.shardIndex = ref.shardIndex;
} else if (hit.shardIndex == -1) {
throw new IllegalArgumentException("setShardIndex is false but TopDocs[" + ref.shardIndex + "].scoreDocs[" + (ref.hitIndex-1) + "] is not set");
}
if (hitUpto >= start) {
hits[hitUpto - start] = hit;
}
//System.out.println(" hitUpto=" + hitUpto);
//System.out.println(" doc=" + hits[hitUpto].doc + " score=" + hits[hitUpto].score);
hitUpto++;
if (ref.hitIndex < shardHits[ref.shardIndex].scoreDocs.length) {
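To make the new javadoc above concrete, here is a minimal sketch of the "multiple reductions" case it describes: shard indices are pre-assigned on each ScoreDoc and setShardIndex=false is passed so they survive a second round of merging. Doc IDs, scores and the shard numbering are invented for the sketch.

import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public class IncrementalMergeSketch {
  public static void main(String[] args) {
    // Shard indices are assigned up front via the ScoreDoc constructor, so the
    // order in which responses are merged no longer has to encode the shard.
    TopDocs shard0 = new TopDocs(2, new ScoreDoc[] { new ScoreDoc(10, 4.0f, 0), new ScoreDoc(11, 1.5f, 0) });
    TopDocs shard1 = new TopDocs(1, new ScoreDoc[] { new ScoreDoc(20, 3.0f, 1) });
    TopDocs shard2 = new TopDocs(1, new ScoreDoc[] { new ScoreDoc(30, 2.0f, 2) });

    // First reduction: merge whatever responses have arrived, with setShardIndex=false
    // so the pre-assigned indices are left untouched.
    TopDocs partial = TopDocs.merge(0, 10, new TopDocs[] { shard0, shard1 }, false);

    // Second reduction: fold in a late-arriving shard. Hits still report the
    // original shards 0, 1 and 2, not positions in these intermediate arrays.
    TopDocs total = TopDocs.merge(0, 10, new TopDocs[] { partial, shard2 }, false);
    for (ScoreDoc hit : total.scoreDocs) {
      System.out.println("doc=" + hit.doc + " shard=" + hit.shardIndex);
    }
  }
}

Passing true in this scenario would instead overwrite the pre-assigned indices with positions in the intermediate arrays, losing the mapping back to the original shards.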

lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java

@@ -17,15 +17,22 @@
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.CompositeReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
@@ -35,13 +42,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class TestTopDocsMerge extends LuceneTestCase {
private static class ShardSearcher extends IndexSearcher {
@@ -77,18 +77,18 @@ public class TestTopDocsMerge extends LuceneTestCase {
public void testInconsistentTopDocsFail() {
TopDocs[] topDocs = new TopDocs[] {
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f, 1) }),
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f) }),
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f, -1) })
};
if (random().nextBoolean()) {
ArrayUtil.swap(topDocs, 0, 1);
}
expectThrows(IllegalStateException.class, () -> {
TopDocs.merge(0, 1, topDocs);
expectThrows(IllegalArgumentException.class, () -> {
TopDocs.merge(0, 1, topDocs, false);
});
}
public void testAssignShardIndex() {
public void testPreAssignedShardIndex() {
boolean useConstantScore = random().nextBoolean();
int numTopDocs = 2 + random().nextInt(10);
ArrayList<TopDocs> topDocs = new ArrayList<>(numTopDocs);
@@ -100,8 +100,8 @@ public class TestTopDocsMerge extends LuceneTestCase {
ScoreDoc[] scoreDocs = new ScoreDoc[numHits];
for (int j = 0; j < scoreDocs.length; j++) {
float score = useConstantScore ? 1.0f : random().nextFloat();
scoreDocs[j] = new ScoreDoc((100 * i) + j, score , i);
// we set the shard index to index in the list here but shuffle the entire list below
scoreDocs[j] = new ScoreDoc((100 * i) + j, score , i);
}
topDocs.add(new TopDocs(numHits, scoreDocs));
shardResultMapping.put(i, topDocs.get(i));
@@ -111,7 +111,11 @@ public class TestTopDocsMerge extends LuceneTestCase {
Collections.shuffle(topDocs, random());
final int from = random().nextInt(numHitsTotal-1);
final int size = 1 + random().nextInt(numHitsTotal - from);
TopDocs merge = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]));
// passing false here means TopDocs.merge uses the incoming ScoreDoc.shardIndex
// that we already set, instead of the position of that TopDocs in the array:
TopDocs merge = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]), false);
assertTrue(merge.scoreDocs.length > 0);
for (ScoreDoc scoreDoc : merge.scoreDocs) {
assertTrue(scoreDoc.shardIndex != -1);
@@ -129,7 +133,7 @@ public class TestTopDocsMerge extends LuceneTestCase {
// now ensure merge is stable even if we use our own shard IDs
Collections.shuffle(topDocs, random());
TopDocs merge2 = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]));
TopDocs merge2 = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]), false);
assertArrayEquals(merge.scoreDocs, merge2.scoreDocs);
}
@@ -346,9 +350,9 @@ public class TestTopDocsMerge extends LuceneTestCase {
final TopDocs mergedHits;
if (useFrom) {
if (sort == null) {
mergedHits = TopDocs.merge(from, size, shardHits);
mergedHits = TopDocs.merge(from, size, shardHits, true);
} else {
mergedHits = TopDocs.merge(sort, from, size, (TopFieldDocs[]) shardHits);
mergedHits = TopDocs.merge(sort, from, size, (TopFieldDocs[]) shardHits, true);
}
} else {
if (sort == null) {