mirror of https://github.com/apache/lucene.git
LUCENE-7707: add explicit boolean to TopDocs.merge to govern whether incoming or implicit shard index should be used
This commit is contained in:
parent
471d842742
commit
2e56c0e505
|
@ -97,6 +97,12 @@ API Changes
|
|||
|
||||
* LUCENE-7702: Removed GraphQuery in favour of simple boolean query. (Matt Webber via Jim Ferenczi)
|
||||
|
||||
* LUCENE-7707: TopDocs.merge now takes a boolean option telling it
|
||||
when to use the incoming shard index versus when to assign the shard
|
||||
index itself, allowing users to merge shard responses incrementally
|
||||
instead of once all shard responses are present. (Simon Willnauer,
|
||||
Mike McCandless)
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-7449: Add CROSSES relation support to RangeFieldQuery. (Nick Knize)
|
||||
|
@ -172,10 +178,6 @@ Improvements
|
|||
earlier than regular queries in order to improve cache efficiency.
|
||||
(Adrien Grand)
|
||||
|
||||
* LUCENE-7707: Use predefined shard index when mergeing top docs if present. This
|
||||
allows to use TopDoc#merge to merge shard responses incrementally instead of
|
||||
once all shard responses are present. (Simon Willnauer)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-7641: Optimized point range queries to compute documents that do not
|
||||
|
|
|
@ -52,18 +52,18 @@ public class FieldDoc extends ScoreDoc {
|
|||
|
||||
/** Expert: Creates one of these objects with empty sort information. */
|
||||
public FieldDoc(int doc, float score) {
|
||||
super (doc, score);
|
||||
super(doc, score);
|
||||
}
|
||||
|
||||
/** Expert: Creates one of these objects with the given sort information. */
|
||||
public FieldDoc(int doc, float score, Object[] fields) {
|
||||
super (doc, score);
|
||||
super(doc, score);
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
/** Expert: Creates one of these objects with the given sort information. */
|
||||
public FieldDoc(int doc, float score, Object[] fields, int shardIndex) {
|
||||
super (doc, score, shardIndex);
|
||||
super(doc, score, shardIndex);
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
|
|
|
@ -432,7 +432,7 @@ public class IndexSearcher {
|
|||
for (TopScoreDocCollector collector : collectors) {
|
||||
topDocs[i++] = collector.topDocs();
|
||||
}
|
||||
return TopDocs.merge(cappedNumHits, topDocs);
|
||||
return TopDocs.merge(0, cappedNumHits, topDocs, true);
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -559,7 +559,7 @@ public class IndexSearcher {
|
|||
for (TopFieldCollector collector : collectors) {
|
||||
topDocs[i++] = collector.topDocs();
|
||||
}
|
||||
return TopDocs.merge(sort, cappedNumHits, topDocs);
|
||||
return TopDocs.merge(sort, 0, cappedNumHits, topDocs, true);
|
||||
}
|
||||
|
||||
};
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/** Represents hits returned by {@link
|
||||
|
@ -60,6 +59,8 @@ public class TopDocs {
|
|||
private final static class ShardRef {
|
||||
// Which shard (index into shardHits[]):
|
||||
final int shardIndex;
|
||||
|
||||
// True if we should use the incoming ScoreDoc.shardIndex for sort order
|
||||
final boolean useScoreDocIndex;
|
||||
|
||||
// Which hit within the shard:
|
||||
|
@ -77,10 +78,12 @@ public class TopDocs {
|
|||
|
||||
int getShardIndex(ScoreDoc scoreDoc) {
|
||||
if (useScoreDocIndex) {
|
||||
assert scoreDoc.shardIndex != -1 : "scoreDoc shardIndex must be predefined set but wasn't";
|
||||
if (scoreDoc.shardIndex == -1) {
|
||||
throw new IllegalArgumentException("setShardIndex is false but TopDocs[" + shardIndex + "].scoreDocs[" + hitIndex + "] is not set");
|
||||
}
|
||||
return scoreDoc.shardIndex;
|
||||
} else {
|
||||
assert scoreDoc.shardIndex == -1 : "scoreDoc shardIndex must be undefined but wasn't";
|
||||
// NOTE: we don't assert that shardIndex is -1 here, because caller could in fact have set it but asked us to ignore it now
|
||||
return shardIndex;
|
||||
}
|
||||
}
|
||||
|
@ -201,23 +204,25 @@ public class TopDocs {
|
|||
* the provided TopDocs, sorting by score. Each {@link TopDocs}
|
||||
* instance must be sorted.
|
||||
*
|
||||
* @see #merge(int, int, TopDocs[])
|
||||
* @see #merge(int, int, TopDocs[], boolean)
|
||||
* @lucene.experimental */
|
||||
public static TopDocs merge(int topN, TopDocs[] shardHits) {
|
||||
return merge(0, topN, shardHits);
|
||||
return merge(0, topN, shardHits, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as {@link #merge(int, TopDocs[])} but also ignores the top
|
||||
* {@code start} top docs. This is typically useful for pagination.
|
||||
*
|
||||
* Note: This method will fill the {@link ScoreDoc#shardIndex} on all score docs returned iff all ScoreDocs passed
|
||||
* to this have it's shard index set to <tt>-1</tt>. Otherwise the shard index is not set. This allows to predefine
|
||||
* the shard index in order to incrementally merge shard responses without losing the original shard index.
|
||||
* Note: If {@code setShardIndex} is true, this method will assume the incoming order of {@code shardHits} reflects
|
||||
* each shard's index and will fill the {@link ScoreDoc#shardIndex}, otherwise
|
||||
* it must already be set for all incoming {@code ScoreDoc}s, which can be useful when doing multiple reductions
|
||||
* (merges) of TopDocs.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public static TopDocs merge(int start, int topN, TopDocs[] shardHits) {
|
||||
return mergeAux(null, start, topN, shardHits);
|
||||
public static TopDocs merge(int start, int topN, TopDocs[] shardHits, boolean setShardIndex) {
|
||||
return mergeAux(null, start, topN, shardHits, setShardIndex);
|
||||
}
|
||||
|
||||
/** Returns a new TopFieldDocs, containing topN results across
|
||||
|
@ -226,31 +231,34 @@ public class TopDocs {
|
|||
* the same Sort, and sort field values must have been
|
||||
* filled (ie, <code>fillFields=true</code> must be
|
||||
* passed to {@link TopFieldCollector#create}).
|
||||
* @see #merge(Sort, int, int, TopFieldDocs[])
|
||||
* @see #merge(Sort, int, int, TopFieldDocs[], boolean)
|
||||
* @lucene.experimental */
|
||||
public static TopFieldDocs merge(Sort sort, int topN, TopFieldDocs[] shardHits) {
|
||||
return merge(sort, 0, topN, shardHits);
|
||||
return merge(sort, 0, topN, shardHits, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as {@link #merge(Sort, int, TopFieldDocs[])} but also ignores the top
|
||||
* {@code start} top docs. This is typically useful for pagination.
|
||||
*
|
||||
* Note: This method will fill the {@link ScoreDoc#shardIndex} on all score docs returned iff all ScoreDocs passed
|
||||
* to this have it's shard index set to <tt>-1</tt>. Otherwise the shard index is not set. This allows to predefine
|
||||
* the shard index in order to incrementally merge shard responses without losing the original shard index.
|
||||
* Note: If {@code setShardIndex} is true, this method will assume the incoming order of {@code shardHits} reflects
|
||||
* each shard's index and will fill the {@link ScoreDoc#shardIndex}, otherwise
|
||||
* it must already be set for all incoming {@code ScoreDoc}s, which can be useful when doing multiple reductions
|
||||
* (merges) of TopDocs.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public static TopFieldDocs merge(Sort sort, int start, int topN, TopFieldDocs[] shardHits) {
|
||||
public static TopFieldDocs merge(Sort sort, int start, int topN, TopFieldDocs[] shardHits, boolean setShardIndex) {
|
||||
if (sort == null) {
|
||||
throw new IllegalArgumentException("sort must be non-null when merging field-docs");
|
||||
}
|
||||
return (TopFieldDocs) mergeAux(sort, start, topN, shardHits);
|
||||
return (TopFieldDocs) mergeAux(sort, start, topN, shardHits, setShardIndex);
|
||||
}
|
||||
|
||||
/** Auxiliary method used by the {@link #merge} impls. A sort value of null
|
||||
* is used to indicate that docs should be sorted by score. */
|
||||
private static TopDocs mergeAux(Sort sort, int start, int size, TopDocs[] shardHits) {
|
||||
private static TopDocs mergeAux(Sort sort, int start, int size, TopDocs[] shardHits, boolean setShardIndex) {
|
||||
|
||||
final PriorityQueue<ShardRef> queue;
|
||||
if (sort == null) {
|
||||
queue = new ScoreMergeSortQueue(shardHits);
|
||||
|
@ -261,28 +269,15 @@ public class TopDocs {
|
|||
int totalHitCount = 0;
|
||||
int availHitCount = 0;
|
||||
float maxScore = Float.MIN_VALUE;
|
||||
Boolean setShardIndex = null;
|
||||
for(int shardIDX=0;shardIDX<shardHits.length;shardIDX++) {
|
||||
final TopDocs shard = shardHits[shardIDX];
|
||||
// totalHits can be non-zero even if no hits were
|
||||
// collected, when searchAfter was used:
|
||||
totalHitCount += shard.totalHits;
|
||||
if (shard.scoreDocs != null && shard.scoreDocs.length > 0) {
|
||||
if (shard.scoreDocs[0].shardIndex == -1) {
|
||||
if (setShardIndex != null && setShardIndex == false) {
|
||||
throw new IllegalStateException("scoreDocs at index " + shardIDX + " has undefined shard indices but previous scoreDocs were predefined");
|
||||
}
|
||||
setShardIndex = true;
|
||||
} else {
|
||||
if (setShardIndex != null && setShardIndex) {
|
||||
throw new IllegalStateException("scoreDocs at index " + shardIDX + " has predefined shard indices but previous scoreDocs were undefined");
|
||||
}
|
||||
setShardIndex = false;
|
||||
}
|
||||
availHitCount += shard.scoreDocs.length;
|
||||
queue.add(new ShardRef(shardIDX, setShardIndex == false));
|
||||
maxScore = Math.max(maxScore, shard.getMaxScore());
|
||||
//System.out.println(" maxScore now " + maxScore + " vs " + shard.getMaxScore());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -303,19 +298,16 @@ public class TopDocs {
|
|||
ShardRef ref = queue.top();
|
||||
final ScoreDoc hit = shardHits[ref.shardIndex].scoreDocs[ref.hitIndex++];
|
||||
if (setShardIndex) {
|
||||
// unless this index is already initialized potentially due to multiple merge phases, or explicitly by the user
|
||||
// we set the shard index to the index of the TopDocs array this hit is coming from.
|
||||
// this allows multiple merge phases if needed but requires extra accounting on the users end.
|
||||
// at the same time this is fully backwards compatible since the value was initialize to -1 from the beginning
|
||||
// caller asked us to record shardIndex (index of the TopDocs array) this hit is coming from:
|
||||
hit.shardIndex = ref.shardIndex;
|
||||
} else if (hit.shardIndex == -1) {
|
||||
throw new IllegalArgumentException("setShardIndex is false but TopDocs[" + ref.shardIndex + "].scoreDocs[" + (ref.hitIndex-1) + "] is not set");
|
||||
}
|
||||
|
||||
if (hitUpto >= start) {
|
||||
hits[hitUpto - start] = hit;
|
||||
}
|
||||
|
||||
//System.out.println(" hitUpto=" + hitUpto);
|
||||
//System.out.println(" doc=" + hits[hitUpto].doc + " score=" + hits[hitUpto].score);
|
||||
|
||||
hitUpto++;
|
||||
|
||||
if (ref.hitIndex < shardHits[ref.shardIndex].scoreDocs.length) {
|
||||
|
|
|
@ -17,15 +17,22 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FloatDocValuesField;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.CompositeReaderContext;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -35,13 +42,6 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class TestTopDocsMerge extends LuceneTestCase {
|
||||
|
||||
private static class ShardSearcher extends IndexSearcher {
|
||||
|
@ -77,18 +77,18 @@ public class TestTopDocsMerge extends LuceneTestCase {
|
|||
|
||||
public void testInconsistentTopDocsFail() {
|
||||
TopDocs[] topDocs = new TopDocs[] {
|
||||
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f, 1) }),
|
||||
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f) }),
|
||||
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f, -1) })
|
||||
};
|
||||
if (random().nextBoolean()) {
|
||||
ArrayUtil.swap(topDocs, 0, 1);
|
||||
}
|
||||
expectThrows(IllegalStateException.class, () -> {
|
||||
TopDocs.merge(0, 1, topDocs);
|
||||
expectThrows(IllegalArgumentException.class, () -> {
|
||||
TopDocs.merge(0, 1, topDocs, false);
|
||||
});
|
||||
}
|
||||
|
||||
public void testAssignShardIndex() {
|
||||
public void testPreAssignedShardIndex() {
|
||||
boolean useConstantScore = random().nextBoolean();
|
||||
int numTopDocs = 2 + random().nextInt(10);
|
||||
ArrayList<TopDocs> topDocs = new ArrayList<>(numTopDocs);
|
||||
|
@ -100,8 +100,8 @@ public class TestTopDocsMerge extends LuceneTestCase {
|
|||
ScoreDoc[] scoreDocs = new ScoreDoc[numHits];
|
||||
for (int j = 0; j < scoreDocs.length; j++) {
|
||||
float score = useConstantScore ? 1.0f : random().nextFloat();
|
||||
scoreDocs[j] = new ScoreDoc((100 * i) + j, score , i);
|
||||
// we set the shard index to index in the list here but shuffle the entire list below
|
||||
scoreDocs[j] = new ScoreDoc((100 * i) + j, score , i);
|
||||
}
|
||||
topDocs.add(new TopDocs(numHits, scoreDocs));
|
||||
shardResultMapping.put(i, topDocs.get(i));
|
||||
|
@ -111,7 +111,11 @@ public class TestTopDocsMerge extends LuceneTestCase {
|
|||
Collections.shuffle(topDocs, random());
|
||||
final int from = random().nextInt(numHitsTotal-1);
|
||||
final int size = 1 + random().nextInt(numHitsTotal - from);
|
||||
TopDocs merge = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]));
|
||||
|
||||
// passing false here means TopDocs.merge uses the incoming ScoreDoc.shardIndex
|
||||
// that we already set, instead of the position of that TopDocs in the array:
|
||||
TopDocs merge = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]), false);
|
||||
|
||||
assertTrue(merge.scoreDocs.length > 0);
|
||||
for (ScoreDoc scoreDoc : merge.scoreDocs) {
|
||||
assertTrue(scoreDoc.shardIndex != -1);
|
||||
|
@ -129,7 +133,7 @@ public class TestTopDocsMerge extends LuceneTestCase {
|
|||
|
||||
// now ensure merge is stable even if we use our own shard IDs
|
||||
Collections.shuffle(topDocs, random());
|
||||
TopDocs merge2 = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]));
|
||||
TopDocs merge2 = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]), false);
|
||||
assertArrayEquals(merge.scoreDocs, merge2.scoreDocs);
|
||||
}
|
||||
|
||||
|
@ -346,9 +350,9 @@ public class TestTopDocsMerge extends LuceneTestCase {
|
|||
final TopDocs mergedHits;
|
||||
if (useFrom) {
|
||||
if (sort == null) {
|
||||
mergedHits = TopDocs.merge(from, size, shardHits);
|
||||
mergedHits = TopDocs.merge(from, size, shardHits, true);
|
||||
} else {
|
||||
mergedHits = TopDocs.merge(sort, from, size, (TopFieldDocs[]) shardHits);
|
||||
mergedHits = TopDocs.merge(sort, from, size, (TopFieldDocs[]) shardHits, true);
|
||||
}
|
||||
} else {
|
||||
if (sort == null) {
|
||||
|
|
Loading…
Reference in New Issue