diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3961006ec10..877fcece9ac 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -25,10 +25,6 @@ API Changes
 * LUCENE-8811: Deprecated BooleanQuery#setMaxClauseCount() and getMaxClauseCount
   have been removed (Atri Sharma, Adrien Grand, Alan Woodward)
 
-* LUCENE-8857: Introduce Custom Tiebreakers in TopDocs.merge for tie breaking on
-  docs on equal scores. Also, remove the ability of TopDocs.merge to set shard
-  indices (Atri Sharma, Adrien Grand, Simon Willnauer)
-
 Bug fixes
 
 * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt
index 7cb7dd97355..4752787b99f 100644
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@@ -182,11 +182,3 @@ to concurrent changes.
 ## maxClausesCount moved from BooleanQuery To IndexSearcher (LUCENE-8811) ##
 
 IndexSearcher now performs max clause count checks on all types of queries (including BooleanQueries). This led to a logical move of the clauses count from BooleanQuery to IndexSearcher.
-
-## TopDocs.merge shall no longer allow setting of shard indices ##
-
-TopDocs.merge's API has been changed to stop allowing passing in a parameter to indicate if it should
-set shard indices for hits as they are seen during the merge process. This is done to simplify the API
-to be more dynamic in terms of passing in custom tie breakers.
-If shard indices are to be used for tie breaking docs with equal scores during TopDocs.merge, then it is
-mandatory that the input ScoreDocs have their shard indices set to valid values prior to calling TopDocs.merge
diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
index 3b2ab474d39..84974a6239e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -479,7 +479,7 @@ public class IndexSearcher {
         for (TopScoreDocCollector collector : collectors) {
           topDocs[i++] = collector.topDocs();
         }
-        return TopDocs.merge(0, cappedNumHits, topDocs);
+        return TopDocs.merge(0, cappedNumHits, topDocs, true);
       }
 
     };
@@ -608,7 +608,7 @@ public class IndexSearcher {
         for (TopFieldCollector collector : collectors) {
           topDocs[i++] = collector.topDocs();
         }
-        return TopDocs.merge(rewrittenSort, 0, cappedNumHits, topDocs);
+        return TopDocs.merge(rewrittenSort, 0, cappedNumHits, topDocs, true);
       }
 
     };
diff --git a/lucene/core/src/java/org/apache/lucene/search/TopDocs.java b/lucene/core/src/java/org/apache/lucene/search/TopDocs.java
index 0439b882afc..d4668930781 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TopDocs.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TopDocs.java
@@ -16,8 +16,6 @@
  */
 package org.apache.lucene.search;
 
-import java.util.Comparator;
-
 import org.apache.lucene.util.PriorityQueue;
 
 /** Represents hits returned by {@link
@@ -30,15 +28,6 @@ public class TopDocs {
   /** The top hits for the query. */
   public ScoreDoc[] scoreDocs;
 
-  /** Internal comparator with shardIndex */
-  private static final Comparator<ScoreDoc> SHARD_INDEX_TIE_BREAKER = Comparator.comparingInt(d -> d.shardIndex);
-
-  /** Internal comparator with docID */
-  private static final Comparator<ScoreDoc> DOC_ID_TIE_BREAKER = Comparator.comparingInt(d -> d.doc);
-
-  /** Default comparator */
-  private static final Comparator<ScoreDoc> DEFAULT_TIE_BREAKER = SHARD_INDEX_TIE_BREAKER.thenComparing(DOC_ID_TIE_BREAKER);
-
   /** Constructs a TopDocs. */
   public TopDocs(TotalHits totalHits, ScoreDoc[] scoreDocs) {
     this.totalHits = totalHits;
@@ -50,52 +39,66 @@ public class TopDocs {
     // Which shard (index into shardHits[]):
     final int shardIndex;
 
+    // True if we should use the incoming ScoreDoc.shardIndex for sort order
+    final boolean useScoreDocIndex;
+
     // Which hit within the shard:
     int hitIndex;
 
-    ShardRef(int shardIndex) {
+    ShardRef(int shardIndex, boolean useScoreDocIndex) {
       this.shardIndex = shardIndex;
+      this.useScoreDocIndex = useScoreDocIndex;
     }
 
     @Override
    public String toString() {
       return "ShardRef(shardIndex=" + shardIndex + " hitIndex=" + hitIndex + ")";
     }
+
+    int getShardIndex(ScoreDoc scoreDoc) {
+      if (useScoreDocIndex) {
+        if (scoreDoc.shardIndex == -1) {
+          throw new IllegalArgumentException("setShardIndex is false but TopDocs[" + shardIndex + "].scoreDocs[" + hitIndex + "] is not set");
+        }
+        return scoreDoc.shardIndex;
+      } else {
+        // NOTE: we don't assert that shardIndex is -1 here, because caller could in fact have set it but asked us to ignore it now
+        return shardIndex;
+      }
+    }
   }
 
   /**
-   * Use the tie breaker if provided. If tie breaker returns 0 signifying equal values, we use hit indices
-   * to tie break intra shard ties
+   * if we need to tie-break since score / sort value are the same we first compare shard index (lower shard wins)
+   * and then iff shard index is the same we use the hit index.
    */
-  static boolean tieBreakLessThan(ShardRef first, ScoreDoc firstDoc, ShardRef second, ScoreDoc secondDoc,
-                                  Comparator<ScoreDoc> tieBreaker) {
-    assert tieBreaker != null;
-    int value = tieBreaker.compare(firstDoc, secondDoc);
-
-    if (value == 0) {
-      // Equal Values
+  static boolean tieBreakLessThan(ShardRef first, ScoreDoc firstDoc, ShardRef second, ScoreDoc secondDoc) {
+    final int firstShardIndex = first.getShardIndex(firstDoc);
+    final int secondShardIndex = second.getShardIndex(secondDoc);
+    // Tie break: earlier shard wins
+    if (firstShardIndex < secondShardIndex) {
+      return true;
+    } else if (firstShardIndex > secondShardIndex) {
+      return false;
+    } else {
       // Tie break in same shard: resolve however the
       // shard had resolved it:
       assert first.hitIndex != second.hitIndex;
       return first.hitIndex < second.hitIndex;
     }
-
-    return value < 0;
   }
 
   // Specialized MergeSortQueue that just merges by
   // relevance score, descending:
   private static class ScoreMergeSortQueue extends PriorityQueue<ShardRef> {
     final ScoreDoc[][] shardHits;
-    final Comparator<ScoreDoc> tieBreakerComparator;
 
-    public ScoreMergeSortQueue(TopDocs[] shardHits, Comparator<ScoreDoc> tieBreakerComparator) {
+    public ScoreMergeSortQueue(TopDocs[] shardHits) {
       super(shardHits.length);
       this.shardHits = new ScoreDoc[shardHits.length][];
       for(int shardIDX=0;shardIDX<shardHits.length;shardIDX++) {
         this.shardHits[shardIDX] = shardHits[shardIDX].scoreDocs;
       }
-      this.tieBreakerComparator = tieBreakerComparator;
     }
 
     @Override
     public boolean lessThan(ShardRef first, ShardRef second) {
       assert first != second;
       ScoreDoc firstScoreDoc = shardHits[first.shardIndex][first.hitIndex];
       ScoreDoc secondScoreDoc = shardHits[second.shardIndex][second.hitIndex];
       if (firstScoreDoc.score < secondScoreDoc.score) {
         return false;
       } else if (firstScoreDoc.score > secondScoreDoc.score) {
         return true;
       } else {
-        return tieBreakLessThan(first, firstScoreDoc, second, secondScoreDoc, tieBreakerComparator);
+        return tieBreakLessThan(first, firstScoreDoc, second, secondScoreDoc);
       }
     }
   }
@@ -120,12 +123,10 @@ public class TopDocs {
     final ScoreDoc[][] shardHits;
     final FieldComparator<?>[] comparators;
     final int[] reverseMul;
-    final Comparator<ScoreDoc> tieBreaker;
 
-    public MergeSortQueue(Sort sort, TopDocs[] shardHits, Comparator<ScoreDoc> tieBreaker) {
+    public MergeSortQueue(Sort sort, TopDocs[] shardHits) {
       super(shardHits.length);
       this.shardHits = new ScoreDoc[shardHits.length][];
-      this.tieBreaker = tieBreaker;
       for(int shardIDX=0;shardIDX<shardHits.length;shardIDX++) {
[...]
-  public static TopFieldDocs merge(Sort sort, int start, int topN, TopFieldDocs[] shardHits,
-                                   Comparator<ScoreDoc> tieBreaker) {
-    if (sort == null) {
-      throw new IllegalArgumentException("sort must be non-null when merging field-docs");
-    }
-    return (TopFieldDocs) mergeAux(sort, start, topN, shardHits, tieBreaker);
+  public static TopFieldDocs merge(Sort sort, int start, int topN, TopFieldDocs[] shardHits, boolean setShardIndex) {
+    return (TopFieldDocs) mergeAux(sort, start, topN, shardHits, setShardIndex);
   }
 
   /** Auxiliary method used by the {@link #merge} impls. A sort value of null
    * is used to indicate that docs should be sorted by score. */
-  private static TopDocs mergeAux(Sort sort, int start, int size, TopDocs[] shardHits,
-                                  Comparator<ScoreDoc> tieBreaker) {
+  private static TopDocs mergeAux(Sort sort, int start, int size, TopDocs[] shardHits, boolean setShardIndex) {
     final PriorityQueue<ShardRef> queue;
     if (sort == null) {
-      queue = new ScoreMergeSortQueue(shardHits, tieBreaker);
+      queue = new ScoreMergeSortQueue(shardHits);
     } else {
-      queue = new MergeSortQueue(sort, shardHits, tieBreaker);
+      queue = new MergeSortQueue(sort, shardHits);
     }
 
     long totalHitCount = 0;
@@ -281,12 +260,11 @@ public class TopDocs {
       }
       if (shard.scoreDocs != null && shard.scoreDocs.length > 0) {
         availHitCount += shard.scoreDocs.length;
-        queue.add(new ShardRef(shardIDX));
+        queue.add(new ShardRef(shardIDX, setShardIndex == false));
       }
     }
 
     final ScoreDoc[] hits;
-    boolean unsetShardIndex = false;
     if (availHitCount <= start) {
       hits = new ScoreDoc[0];
     } else {
@@ -298,16 +276,12 @@
       assert queue.size() > 0;
       ShardRef ref = queue.top();
       final ScoreDoc hit = shardHits[ref.shardIndex].scoreDocs[ref.hitIndex++];
-
-      // Irrespective of whether we use shard indices for tie breaking or not, we check for consistent
-      // order in shard indices to defend against potential bugs
-      if (hitUpto > 0) {
-        if (unsetShardIndex != (hit.shardIndex == -1)) {
-          throw new IllegalArgumentException("Inconsistent order of shard indices");
-        }
+      if (setShardIndex) {
+        // caller asked us to record shardIndex (index of the TopDocs array) this hit is coming from:
+        hit.shardIndex = ref.shardIndex;
+      } else if (hit.shardIndex == -1) {
+        throw new IllegalArgumentException("setShardIndex is false but TopDocs[" + ref.shardIndex + "].scoreDocs[" + (ref.hitIndex-1) + "] is not set");
       }
-
-      unsetShardIndex |= hit.shardIndex == -1;
 
       if (hitUpto >= start) {
         hits[hitUpto - start] = hit;
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMultiSliceMerge.java b/lucene/core/src/test/org/apache/lucene/search/TestMultiSliceMerge.java
deleted file mode 100644
index 0d23dc92480..00000000000
--- a/lucene/core/src/test/org/apache/lucene/search/TestMultiSliceMerge.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.search;
-
-import java.util.Random;
-import java.util.concurrent.Executor;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.SortedDocValuesField;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.LuceneTestCase;
-
-public class TestMultiSliceMerge extends LuceneTestCase {
-  Directory dir1;
-  Directory dir2;
-  IndexReader reader1;
-  IndexReader reader2;
-
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    dir1 = newDirectory();
-    dir2 = newDirectory();
-    Random random = random();
-    RandomIndexWriter iw1 = new RandomIndexWriter(random(), dir1, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
-    for (int i = 0; i < 100; i++) {
-      Document doc = new Document();
-      doc.add(newStringField("field", Integer.toString(i), Field.Store.NO));
-      doc.add(newStringField("field2", Boolean.toString(i % 2 == 0), Field.Store.NO));
-      doc.add(new SortedDocValuesField("field2", new BytesRef(Boolean.toString(i % 2 == 0))));
-      iw1.addDocument(doc);
-
-      if (random.nextBoolean()) {
-        iw1.getReader().close();
-      }
-    }
-    reader1 = iw1.getReader();
-    iw1.close();
-
-    RandomIndexWriter iw2 = new RandomIndexWriter(random(), dir2, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
-    for (int i = 0; i < 100; i++) {
-      Document doc = new Document();
-      doc.add(newStringField("field", Integer.toString(i), Field.Store.NO));
-      doc.add(newStringField("field2", Boolean.toString(i % 2 == 0), Field.Store.NO));
-      doc.add(new SortedDocValuesField("field2", new BytesRef(Boolean.toString(i % 2 == 0))));
-      iw2.addDocument(doc);
-
-      if (random.nextBoolean()) {
-        iw2.commit();
-      }
-    }
-    reader2 = iw2.getReader();
-    iw2.close();
-  }
-
-  @Override
-  public void tearDown() throws Exception {
-    super.tearDown();
-    reader1.close();
-    reader2.close();
-    dir1.close();
-    dir2.close();
-  }
-
-  public void testMultipleSlicesOfSameIndexSearcher() throws Exception {
-    Executor executor1 = runnable -> runnable.run();
-    Executor executor2 = runnable -> runnable.run();
-
-    IndexSearcher searchers[] = new IndexSearcher[] {
-        new IndexSearcher(reader1, executor1),
-        new IndexSearcher(reader2, executor2)
-    };
-
-    Query query = new MatchAllDocsQuery();
-
-    TopDocs topDocs1 = searchers[0].search(query, Integer.MAX_VALUE);
-    TopDocs topDocs2 = searchers[1].search(query, Integer.MAX_VALUE);
-
-    CheckHits.checkEqual(query, topDocs1.scoreDocs, topDocs2.scoreDocs);
-  }
-
-  public void testMultipleSlicesOfMultipleIndexSearchers() throws Exception {
-    Executor executor1 = runnable -> runnable.run();
-    Executor executor2 = runnable -> runnable.run();
-
-    IndexSearcher searchers[] = new IndexSearcher[] {
-        new IndexSearcher(reader1, executor1),
-        new IndexSearcher(reader2, executor2)
-    };
-
-    Query query = new MatchAllDocsQuery();
-
-    TopDocs topDocs1 = searchers[0].search(query, Integer.MAX_VALUE);
-    TopDocs topDocs2 = searchers[1].search(query, Integer.MAX_VALUE);
-
-    assertEquals(topDocs1.scoreDocs.length, topDocs2.scoreDocs.length);
-
-    for (int i = 0; i < topDocs1.scoreDocs.length; i++) {
-      topDocs1.scoreDocs[i].shardIndex = 0;
-      topDocs2.scoreDocs[i].shardIndex = 1;
-    }
-
-    TopDocs[] shardHits = {topDocs1, topDocs2};
-
-    TopDocs mergedHits1 = TopDocs.merge(0, topDocs1.scoreDocs.length, shardHits);
-    TopDocs mergedHits2 = TopDocs.merge(0, topDocs1.scoreDocs.length, shardHits);
-
-    CheckHits.checkEqual(query, mergedHits1.scoreDocs, mergedHits2.scoreDocs);
-  }
-}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java
index 23f0c97be4f..db8b10503cb 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTopDocsMerge.java
@@ -77,14 +77,14 @@ public class TestTopDocsMerge extends LuceneTestCase {
 
   public void testInconsistentTopDocsFail() {
     TopDocs[] topDocs = new TopDocs[] {
-        new TopDocs(new TotalHits(1, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(1, 1.0f, 5) }),
+        new TopDocs(new TotalHits(1, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(1, 1.0f) }),
         new TopDocs(new TotalHits(1, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(1, 1.0f, -1) })
     };
     if (random().nextBoolean()) {
       ArrayUtil.swap(topDocs, 0, 1);
     }
     expectThrows(IllegalArgumentException.class, () -> {
-      TopDocs.merge(0, 2, topDocs);
+      TopDocs.merge(0, 1, topDocs, false);
     });
   }
 
@@ -114,7 +114,7 @@
 
       // passing false here means TopDocs.merge uses the incoming ScoreDoc.shardIndex
       // that we already set, instead of the position of that TopDocs in the array:
-      TopDocs merge = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]));
+      TopDocs merge = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]), false);
 
       assertTrue(merge.scoreDocs.length > 0);
       for (ScoreDoc scoreDoc : merge.scoreDocs) {
@@ -133,7 +133,7 @@
 
       // now ensure merge is stable even if we use our own shard IDs
       Collections.shuffle(topDocs, random());
-      TopDocs merge2 = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]));
+      TopDocs merge2 = TopDocs.merge(from, size, topDocs.toArray(new TopDocs[0]), false);
       assertArrayEquals(merge.scoreDocs, merge2.scoreDocs);
     }
 
@@ -335,10 +335,6 @@
         subHits = c.topDocs(0, numHits);
       }
 
-      for (int i = 0; i < subHits.scoreDocs.length; i++) {
-        subHits.scoreDocs[i].shardIndex = shardIDX;
-      }
-
       shardHits[shardIDX] = subHits;
       if (VERBOSE) {
         System.out.println(" shard=" + shardIDX + " " + subHits.totalHits.value + " totalHits hits=" + (subHits.scoreDocs == null ? "null" : subHits.scoreDocs.length));
"null" : subHits.scoreDocs.length)); @@ -354,9 +350,9 @@ public class TestTopDocsMerge extends LuceneTestCase { final TopDocs mergedHits; if (useFrom) { if (sort == null) { - mergedHits = TopDocs.merge(from, size, shardHits); + mergedHits = TopDocs.merge(from, size, shardHits, true); } else { - mergedHits = TopDocs.merge(sort, from, size, (TopFieldDocs[]) shardHits); + mergedHits = TopDocs.merge(sort, from, size, (TopFieldDocs[]) shardHits, true); } } else { if (sort == null) { @@ -383,10 +379,10 @@ public class TestTopDocsMerge extends LuceneTestCase { } public void testMergeTotalHitsRelation() { - TopDocs topDocs1 = new TopDocs(new TotalHits(2, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(42, 2f, 0) }); - TopDocs topDocs2 = new TopDocs(new TotalHits(1, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(42, 2f, 1) }); - TopDocs topDocs3 = new TopDocs(new TotalHits(1, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), new ScoreDoc[] { new ScoreDoc(42, 2f, 2) }); - TopDocs topDocs4 = new TopDocs(new TotalHits(3, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), new ScoreDoc[] { new ScoreDoc(42, 2f, 3) }); + TopDocs topDocs1 = new TopDocs(new TotalHits(2, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(42, 2f) }); + TopDocs topDocs2 = new TopDocs(new TotalHits(1, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(42, 2f) }); + TopDocs topDocs3 = new TopDocs(new TotalHits(1, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), new ScoreDoc[] { new ScoreDoc(42, 2f) }); + TopDocs topDocs4 = new TopDocs(new TotalHits(3, TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO), new ScoreDoc[] { new ScoreDoc(42, 2f) }); TopDocs merged1 = TopDocs.merge(1, new TopDocs[] {topDocs1, topDocs2}); assertEquals(new TotalHits(3, TotalHits.Relation.EQUAL_TO), merged1.totalHits); diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java b/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java index adf881889df..27d4e6c34f7 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/ShardSearchingTestBase.java @@ -347,10 +347,6 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase { } else { shardHits[nodeID] = searchNode(nodeID, nodeVersions, query, null, numHits, null); } - - for (int i = 0; i < shardHits[nodeID].scoreDocs.length; i++) { - shardHits[nodeID].scoreDocs[i].shardIndex = nodeID; - } } // Merge: @@ -405,10 +401,6 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase { } else { shardHits[nodeID] = searchNode(nodeID, nodeVersions, query, null, numHits, shardAfter); } - - for (int i = 0; i < shardHits[nodeID].scoreDocs.length; i++) { - shardHits[nodeID].scoreDocs[i].shardIndex = nodeID; - } //System.out.println(" node=" + nodeID + " totHits=" + shardHits[nodeID].totalHits); } @@ -432,10 +424,6 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase { } else { shardHits[nodeID] = (TopFieldDocs) searchNode(nodeID, nodeVersions, query, sort, numHits, null); } - - for (int i = 0; i < shardHits[nodeID].scoreDocs.length; i++) { - shardHits[nodeID].scoreDocs[i].shardIndex = nodeID; - } } // Merge: