LUCENE-7707: add explicit boolean to TopDocs.merge to govern whether incoming or implicit shard index should be used

Mike McCandless 2017-02-24 17:00:45 -05:00
parent 471d842742
commit 2e56c0e505
5 changed files with 62 additions and 64 deletions

lucene/CHANGES.txt

@@ -97,6 +97,12 @@ API Changes
* LUCENE-7702: Removed GraphQuery in favour of simple boolean query. (Matt Webber via Jim Ferenczi)
* LUCENE-7707: TopDocs.merge now takes a boolean option telling it
when to use the incoming shard index versus when to assign the shard
index itself, allowing users to merge shard responses incrementally
instead of once all shard responses are present. (Simon Willnauer,
Mike McCandless)
New Features
* LUCENE-7449: Add CROSSES relation support to RangeFieldQuery. (Nick Knize)
@@ -172,10 +178,6 @@ Improvements
earlier than regular queries in order to improve cache efficiency.
(Adrien Grand)
* LUCENE-7707: Use predefined shard index when mergeing top docs if present. This
allows to use TopDoc#merge to merge shard responses incrementally instead of
once all shard responses are present. (Simon Willnauer)
Optimizations
* LUCENE-7641: Optimized point range queries to compute documents that do not
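To illustrate the LUCENE-7707 entry above, here is a minimal sketch of the updated TopDocs.merge with setShardIndex=true, which keeps the pre-existing behavior of assigning shard indices from array position. The doc IDs and scores are invented for the sketch; only the merge signature and the ScoreDoc/TopDocs constructors come from this patch.

import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public class MergeSetShardIndexSketch {
  public static void main(String[] args) {
    // Two per-shard top-hits lists; each shard's docs are already sorted by score.
    TopDocs shard0 = new TopDocs(2, new ScoreDoc[] { new ScoreDoc(0, 3.0f), new ScoreDoc(5, 1.0f) });
    TopDocs shard1 = new TopDocs(1, new ScoreDoc[] { new ScoreDoc(2, 2.0f) });

    // setShardIndex=true: merge fills ScoreDoc.shardIndex from each TopDocs'
    // position in the array (0 and 1 here), i.e. the behavior callers got before.
    TopDocs merged = TopDocs.merge(0, 3, new TopDocs[] { shard0, shard1 }, true);
    for (ScoreDoc hit : merged.scoreDocs) {
      System.out.println("doc=" + hit.doc + " score=" + hit.score + " shard=" + hit.shardIndex);
    }
  }
}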

lucene/core/src/java/org/apache/lucene/search/FieldDoc.java

@@ -52,18 +52,18 @@ public class FieldDoc extends ScoreDoc {
/** Expert: Creates one of these objects with empty sort information. */
public FieldDoc(int doc, float score) {
super (doc, score);
super(doc, score);
}
/** Expert: Creates one of these objects with the given sort information. */
public FieldDoc(int doc, float score, Object[] fields) {
super (doc, score);
super(doc, score);
this.fields = fields;
}
/** Expert: Creates one of these objects with the given sort information. */
public FieldDoc(int doc, float score, Object[] fields, int shardIndex) {
super (doc, score, shardIndex);
super(doc, score, shardIndex);
this.fields = fields;
}

lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java

@@ -432,7 +432,7 @@ public class IndexSearcher {
for (TopScoreDocCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(cappedNumHits, topDocs);
return TopDocs.merge(0, cappedNumHits, topDocs, true);
}
};
@@ -559,7 +559,7 @@ public class IndexSearcher {
for (TopFieldCollector collector : collectors) {
topDocs[i++] = collector.topDocs();
}
return TopDocs.merge(sort, cappedNumHits, topDocs);
return TopDocs.merge(sort, 0, cappedNumHits, topDocs, true);
}
};
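Both IndexSearcher call sites above pass true, so per-slice results keep getting their shardIndex assigned from array position, exactly as before this change. Below is a hedged sketch of the user-facing path that reaches this reduce; the index directory "/path/to/index" and the "body"/"lucene" field and term are placeholders, not part of the patch.

import java.nio.file.Paths;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class ExecutorSearchSketch {
  public static void main(String[] args) throws Exception {
    ExecutorService executor = Executors.newFixedThreadPool(4);
    // "/path/to/index" is a placeholder path; any existing index works.
    try (FSDirectory dir = FSDirectory.open(Paths.get("/path/to/index"));
         DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader, executor);
      // Each leaf slice is collected on the executor; the per-slice TopDocs are
      // then reduced through the TopDocs.merge(..., true) call shown above.
      TopDocs hits = searcher.search(new TermQuery(new Term("body", "lucene")), 10);
      System.out.println("totalHits=" + hits.totalHits);
    } finally {
      executor.shutdown();
    }
  }
}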

lucene/core/src/java/org/apache/lucene/search/TopDocs.java

@@ -16,7 +16,6 @@
*/
package org.apache.lucene.search;
import org.apache.lucene.util.PriorityQueue;
/** Represents hits returned by {@link
@@ -60,6 +59,8 @@ public class TopDocs {
private final static class ShardRef {
// Which shard (index into shardHits[]):
final int shardIndex;
// True if we should use the incoming ScoreDoc.shardIndex for sort order
final boolean useScoreDocIndex;
// Which hit within the shard:
@@ -77,10 +78,12 @@ public class TopDocs {
int getShardIndex(ScoreDoc scoreDoc) {
if (useScoreDocIndex) {
assert scoreDoc.shardIndex != -1 : "scoreDoc shardIndex must be predefined set but wasn't";
if (scoreDoc.shardIndex == -1) {
throw new IllegalArgumentException("setShardIndex is false but TopDocs[" + shardIndex + "].scoreDocs[" + hitIndex + "] is not set");
}
return scoreDoc.shardIndex;
} else {
assert scoreDoc.shardIndex == -1 : "scoreDoc shardIndex must be undefined but wasn't";
// NOTE: we don't assert that shardIndex is -1 here, because caller could in fact have set it but asked us to ignore it now
return shardIndex;
}
}
@@ -201,23 +204,25 @@ public class TopDocs {
* the provided TopDocs, sorting by score. Each {@link TopDocs}
* instance must be sorted.
*
* @see #merge(int, int, TopDocs[])
* @see #merge(int, int, TopDocs[], boolean)
* @lucene.experimental */
public static TopDocs merge(int topN, TopDocs[] shardHits) {
return merge(0, topN, shardHits);
return merge(0, topN, shardHits, true);
}
/**
* Same as {@link #merge(int, TopDocs[])} but also ignores the top
* {@code start} top docs. This is typically useful for pagination.
*
* Note: This method will fill the {@link ScoreDoc#shardIndex} on all score docs returned iff all ScoreDocs passed
* to this have it's shard index set to <tt>-1</tt>. Otherwise the shard index is not set. This allows to predefine
* the shard index in order to incrementally merge shard responses without losing the original shard index.
* Note: If {@code setShardIndex} is true, this method will assume the incoming order of {@code shardHits} reflects
* each shard's index and will fill the {@link ScoreDoc#shardIndex}, otherwise
* it must already be set for all incoming {@code ScoreDoc}s, which can be useful when doing multiple reductions
* (merges) of TopDocs.
*
* @lucene.experimental
*/
public static TopDocs merge(int start, int topN, TopDocs[] shardHits) {
return mergeAux(null, start, topN, shardHits);
public static TopDocs merge(int start, int topN, TopDocs[] shardHits, boolean setShardIndex) {
return mergeAux(null, start, topN, shardHits, setShardIndex);
}
/** Returns a new TopFieldDocs, containing topN results across
@@ -226,31 +231,34 @@ public class TopDocs {
* the same Sort, and sort field values must have been
* filled (ie, <code>fillFields=true</code> must be
* passed to {@link TopFieldCollector#create}).
* @see #merge(Sort, int, int, TopFieldDocs[])
* @see #merge(Sort, int, int, TopFieldDocs[], boolean)
* @lucene.experimental */
public static TopFieldDocs merge(Sort sort, int topN, TopFieldDocs[] shardHits) {
return merge(sort, 0, topN, shardHits);
return merge(sort, 0, topN, shardHits, true);
}
/**
* Same as {@link #merge(Sort, int, TopFieldDocs[])} but also ignores the top
* {@code start} top docs. This is typically useful for pagination.
*
* Note: This method will fill the {@link ScoreDoc#shardIndex} on all score docs returned iff all ScoreDocs passed
* to this have it's shard index set to <tt>-1</tt>. Otherwise the shard index is not set. This allows to predefine
* the shard index in order to incrementally merge shard responses without losing the original shard index.
* Note: If {@code setShardIndex} is true, this method will assume the incoming order of {@code shardHits} reflects
* each shard's index and will fill the {@link ScoreDoc#shardIndex}, otherwise
* it must already be set for all incoming {@code ScoreDoc}s, which can be useful when doing multiple reductions
* (merges) of TopDocs.
*
* @lucene.experimental
*/
public static TopFieldDocs merge(Sort sort, int start, int topN, TopFieldDocs[] shardHits) {
public static TopFieldDocs merge(Sort sort, int start, int topN, TopFieldDocs[] shardHits, boolean setShardIndex) {
if (sort == null) {
throw new IllegalArgumentException("sort must be non-null when merging field-docs");
}
return (TopFieldDocs) mergeAux(sort, start, topN, shardHits);
return (TopFieldDocs) mergeAux(sort, start, topN, shardHits, setShardIndex);
}
/** Auxiliary method used by the {@link #merge} impls. A sort value of null
* is used to indicate that docs should be sorted by score. */
private static TopDocs mergeAux(Sort sort, int start, int size, TopDocs[] shardHits) {
private static TopDocs mergeAux(Sort sort, int start, int size, TopDocs[] shardHits, boolean setShardIndex) {
final PriorityQueue<ShardRef> queue;
if (sort == null) {
queue = new ScoreMergeSortQueue(shardHits);
@@ -261,28 +269,15 @@ public class TopDocs {
int totalHitCount = 0;
int availHitCount = 0;
float maxScore = Float.MIN_VALUE;
Boolean setShardIndex = null;
for(int shardIDX=0;shardIDX<shardHits.length;shardIDX++) {
final TopDocs shard = shardHits[shardIDX];
// totalHits can be non-zero even if no hits were
// collected, when searchAfter was used:
totalHitCount += shard.totalHits;
if (shard.scoreDocs != null && shard.scoreDocs.length > 0) {
if (shard.scoreDocs[0].shardIndex == -1) {
if (setShardIndex != null && setShardIndex == false) {
throw new IllegalStateException("scoreDocs at index " + shardIDX + " has undefined shard indices but previous scoreDocs were predefined");
}
setShardIndex = true;
} else {
if (setShardIndex != null && setShardIndex) {
throw new IllegalStateException("scoreDocs at index " + shardIDX + " has predefined shard indices but previous scoreDocs were undefined");
}
setShardIndex = false;
}
availHitCount += shard.scoreDocs.length;
queue.add(new ShardRef(shardIDX, setShardIndex == false));
maxScore = Math.max(maxScore, shard.getMaxScore());
//System.out.println(" maxScore now " + maxScore + " vs " + shard.getMaxScore());
}
}
@@ -303,19 +298,16 @@ public class TopDocs {
ShardRef ref = queue.top();
final ScoreDoc hit = shardHits[ref.shardIndex].scoreDocs[ref.hitIndex++];
if (setShardIndex) {
// unless this index is already initialized potentially due to multiple merge phases, or explicitly by the user
// we set the shard index to the index of the TopDocs array this hit is coming from.
// this allows multiple merge phases if needed but requires extra accounting on the users end.
// at the same time this is fully backwards compatible since the value was initialize to -1 from the beginning
// caller asked us to record shardIndex (index of the TopDocs array) this hit is coming from:
hit.shardIndex = ref.shardIndex;
} else if (hit.shardIndex == -1) {
throw new IllegalArgumentException("setShardIndex is false but TopDocs[" + ref.shardIndex + "].scoreDocs[" + (ref.hitIndex-1) + "] is not set");
}
if (hitUpto >= start) {
hits[hitUpto - start] = hit;
}
//System.out.println(" hitUpto=" + hitUpto);
//System.out.println(" doc=" + hits[hitUpto].doc + " score=" + hits[hitUpto].score);
hitUpto++;
if (ref.hitIndex < shardHits[ref.shardIndex].scoreDocs.length) {
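To make the new javadoc above concrete, here is a minimal sketch of the "multiple reductions" case it describes: shard indices are pre-assigned on each ScoreDoc and setShardIndex=false is passed so they survive a second round of merging. Doc IDs, scores and the shard numbering are invented for the sketch.

import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public class IncrementalMergeSketch {
  public static void main(String[] args) {
    // Shard indices are assigned up front via the ScoreDoc constructor, so the
    // order in which responses are merged no longer has to encode the shard.
    TopDocs shard0 = new TopDocs(2, new ScoreDoc[] { new ScoreDoc(10, 4.0f, 0), new ScoreDoc(11, 1.5f, 0) });
    TopDocs shard1 = new TopDocs(1, new ScoreDoc[] { new ScoreDoc(20, 3.0f, 1) });
    TopDocs shard2 = new TopDocs(1, new ScoreDoc[] { new ScoreDoc(30, 2.0f, 2) });

    // First reduction: merge whatever responses have arrived, with setShardIndex=false
    // so the pre-assigned indices are left untouched.
    TopDocs partial = TopDocs.merge(0, 10, new TopDocs[] { shard0, shard1 }, false);

    // Second reduction: fold in a late-arriving shard. Hits still report the
    // original shards 0, 1 and 2, not positions in these intermediate arrays.
    TopDocs total = TopDocs.merge(0, 10, new TopDocs[] { partial, shard2 }, false);
    for (ScoreDoc hit : total.scoreDocs) {
      System.out.println("doc=" + hit.doc + " shard=" + hit.shardIndex);
    }
  }
}

Passing true in this scenario would instead overwrite the pre-assigned indices with positions in the intermediate arrays, losing the mapping back to the original shards.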

lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java

@@ -17,15 +17,22 @@
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.CompositeReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
@@ -35,13 +42,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class TestTopDocsMerge extends LuceneTestCase {
private static class ShardSearcher extends IndexSearcher {
@@ -77,18 +77,18 @@ public class TestTopDocsMerge extends LuceneTestCase {
public void testInconsistentTopDocsFail() {
TopDocs[] topDocs = new TopDocs[] {
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f, 1) }),
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f) }),
new TopDocs(1, new ScoreDoc[] { new ScoreDoc(1, 1.0f, -1) })
};
if (random().nextBoolean()) {
ArrayUtil.swap(topDocs, 0, 1);
}
expectThrows(IllegalStateException.class, () -> {
TopDocs.merge(0, 1, topDocs);
expectThrows(IllegalArgumentException.class, () -> {
TopDocs.merge(0, 1, topDocs, false);
});
}
public void testAssignShardIndex() {
public void testPreAssignedShardIndex() {
boolean useConstantScore = random().nextBoolean();
int numTopDocs = 2 + random().nextInt(10);
ArrayList<TopDocs> topDocs = new ArrayList<>(numTopDocs);
@@ -100,8 +100,8 @@ public class TestTopDocsMerge extends LuceneTestCase {
ScoreDoc[] scoreDocs = new ScoreDoc[numHits];
for (int j = 0; j < scoreDocs.length; j++) {
float score = useConstantScore ? 1.0f : random().nextFloat();
scoreDocs[j] = new ScoreDoc((100 * i) + j, score , i);
// we set the shard index to index in the list here but shuffle the entire list below
scoreDocs[j] = new ScoreDoc((100 * i) + j, score , i);
}
topDocs.add(new TopDocs(numHits, scoreDocs));
shardResultMapping.put(i, topDocs.get(i));
@@ -111,7 +111,11 @@ public class TestTopDocsMerge extends LuceneTestCase {
Collections.shuffle(topDocs, random());
final int from = random().nextInt(numHitsTotal-1);
final int size = 1 + random().nextInt(numHitsTotal - from);
TopDocs merge = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]));
// passing false here means TopDocs.merge uses the incoming ScoreDoc.shardIndex
// that we already set, instead of the position of that TopDocs in the array:
TopDocs merge = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]), false);
assertTrue(merge.scoreDocs.length > 0);
for (ScoreDoc scoreDoc : merge.scoreDocs) {
assertTrue(scoreDoc.shardIndex != -1);
@@ -129,7 +133,7 @@ public class TestTopDocsMerge extends LuceneTestCase {
// now ensure merge is stable even if we use our own shard IDs
Collections.shuffle(topDocs, random());
TopDocs merge2 = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]));
TopDocs merge2 = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]), false);
assertArrayEquals(merge.scoreDocs, merge2.scoreDocs);
}
@@ -346,9 +350,9 @@ public class TestTopDocsMerge extends LuceneTestCase {
final TopDocs mergedHits;
if (useFrom) {
if (sort == null) {
mergedHits = TopDocs.merge(from, size, shardHits);
mergedHits = TopDocs.merge(from, size, shardHits, true);
} else {
mergedHits = TopDocs.merge(sort, from, size, (TopFieldDocs[]) shardHits);
mergedHits = TopDocs.merge(sort, from, size, (TopFieldDocs[]) shardHits, true);
}
} else {
if (sort == null) {