mirror of https://github.com/apache/lucene.git
LUCENE-8060: IndexSearcher's search and searchAfter methods now only compute total hit counts accurately up to 1,000.
This commit is contained in:
parent
0dc124aa78
commit
99dbe93681
|
@ -96,6 +96,10 @@ Changes in Runtime Behavior
|
||||||
* LUCENE-7444: StandardAnalyzer no longer defaults to removing English stopwords
|
* LUCENE-7444: StandardAnalyzer no longer defaults to removing English stopwords
|
||||||
(Alan Woodward)
|
(Alan Woodward)
|
||||||
|
|
||||||
|
* LUCENE-8060: IndexSearcher's search and searchAfter methods now only compute
|
||||||
|
total hit counts accurately up to 1,000 in order to enable top-hits
|
||||||
|
optimizations such as block-max WAND (LUCENE-8135). (Adrien Grand)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
||||||
|
|
|
@ -90,10 +90,12 @@ request in order to only compute scores for documents that made it to the top
|
||||||
hits. As a consequence, the trackDocScores option has been removed and can be
|
hits. As a consequence, the trackDocScores option has been removed and can be
|
||||||
replaced with the new TopFieldCollector#populateScores helper method.
|
replaced with the new TopFieldCollector#populateScores helper method.
|
||||||
|
|
||||||
## TopDocs.totalHits is no longer a long ##
|
## IndexSearcher.search(After) may return lower bounds of the hit count and TopDocs.totalHits is no longer a long ##
|
||||||
|
|
||||||
Lucene 8 received optimizations for collection of top-k matches by not visiting
|
Lucene 8 received optimizations for collection of top-k matches by not visiting
|
||||||
all matches. However these optimizations won't help if all matches still need
|
all matches. However these optimizations won't help if all matches still need
|
||||||
to be visited in order to compute the total number of hits. As a consequence,
|
to be visited in order to compute the total number of hits. As a consequence,
|
||||||
TopDocs.totalHits is now a TotalHits object that is either an exact hit count
|
IndexSearcher's search and searchAfter methods were changed to only count hits
|
||||||
or a lower bound of the hit count.
|
accurately up to 1,000, and TopDocs.totalHits was changed from a long to an
|
||||||
|
object that says whether the hit count is accurate or a lower bound of the
|
||||||
|
actual hit count.
|
||||||
|
|
|
@ -64,6 +64,18 @@ import org.apache.lucene.util.ThreadInterruptedException;
|
||||||
* Once you have a new {@link IndexReader}, it's relatively
|
* Once you have a new {@link IndexReader}, it's relatively
|
||||||
* cheap to create a new IndexSearcher from it.
|
* cheap to create a new IndexSearcher from it.
|
||||||
*
|
*
|
||||||
|
* <p><b>NOTE</b>: The {@link #search} and {@link #searchAfter} methods are
|
||||||
|
* configured to only count top hits accurately up to {@code 1,000} and may
|
||||||
|
* return a {@link TotalHits.Relation lower bound} of the hit count if the
|
||||||
|
* hit count is greater than or equal to {@code 1,000}. On queries that match
|
||||||
|
* lots of documents, counting the number of hits may take much longer than
|
||||||
|
* computing the top hits, so this trade-off makes it possible to get some minimal
|
||||||
|
* information about the hit count without slowing down search too much. The
|
||||||
|
* {@link TopDocs#scoreDocs} array is always accurate however. If this behavior
|
||||||
|
* doesn't suit your needs, you should create collectors manually with either
|
||||||
|
* {@link TopScoreDocCollector#create} or {@link TopFieldCollector#create} and
|
||||||
|
* call {@link #search(Query, Collector)}.
|
||||||
|
*
|
||||||
* <a name="thread-safety"></a><p><b>NOTE</b>: <code>{@link
|
* <a name="thread-safety"></a><p><b>NOTE</b>: <code>{@link
|
||||||
* IndexSearcher}</code> instances are completely
|
* IndexSearcher}</code> instances are completely
|
||||||
* thread safe, meaning multiple threads can call any of its
|
* thread safe, meaning multiple threads can call any of its
|
||||||
|
@ -82,6 +94,11 @@ public class IndexSearcher {
|
||||||
final long maxRamBytesUsed = Math.min(1L << 25, Runtime.getRuntime().maxMemory() / 20);
|
final long maxRamBytesUsed = Math.min(1L << 25, Runtime.getRuntime().maxMemory() / 20);
|
||||||
DEFAULT_QUERY_CACHE = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed);
|
DEFAULT_QUERY_CACHE = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed);
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* By default we count hits accurately up to 1000. This makes sure that we
|
||||||
|
* don't spend most of the search time computing hit counts.
|
||||||
|
*/
|
||||||
|
private static final int TOTAL_HITS_THRESHOLD = 1000;
|
||||||
|
|
||||||
final IndexReader reader; // package private for testing!
|
final IndexReader reader; // package private for testing!
|
||||||
|
|
||||||
|
@ -384,7 +401,7 @@ public class IndexSearcher {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TopScoreDocCollector newCollector() throws IOException {
|
public TopScoreDocCollector newCollector() throws IOException {
|
||||||
return TopScoreDocCollector.create(cappedNumHits, after, Integer.MAX_VALUE);
|
return TopScoreDocCollector.create(cappedNumHits, after, TOTAL_HITS_THRESHOLD);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -513,7 +530,7 @@ public class IndexSearcher {
|
||||||
@Override
|
@Override
|
||||||
public TopFieldCollector newCollector() throws IOException {
|
public TopFieldCollector newCollector() throws IOException {
|
||||||
// TODO: don't pay the price for accurate hit counts by default
|
// TODO: don't pay the price for accurate hit counts by default
|
||||||
return TopFieldCollector.create(rewrittenSort, cappedNumHits, after, Integer.MAX_VALUE);
|
return TopFieldCollector.create(rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -255,7 +255,7 @@ public class TestBoolean2 extends LuceneTestCase {
|
||||||
|
|
||||||
// sanity check expected num matches in bigSearcher
|
// sanity check expected num matches in bigSearcher
|
||||||
assertEquals(mulFactor * collector.totalHits,
|
assertEquals(mulFactor * collector.totalHits,
|
||||||
bigSearcher.search(query, 1).totalHits.value);
|
bigSearcher.count(query));
|
||||||
|
|
||||||
// now check 2 diff scorers from the bigSearcher as well
|
// now check 2 diff scorers from the bigSearcher as well
|
||||||
collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
|
collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
|
||||||
|
@ -398,8 +398,7 @@ public class TestBoolean2 extends LuceneTestCase {
|
||||||
BooleanQuery.Builder q3 = new BooleanQuery.Builder();
|
BooleanQuery.Builder q3 = new BooleanQuery.Builder();
|
||||||
q3.add(q1, BooleanClause.Occur.SHOULD);
|
q3.add(q1, BooleanClause.Occur.SHOULD);
|
||||||
q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD);
|
q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD);
|
||||||
TopDocs hits4 = bigSearcher.search(q3.build(), 1);
|
assertEquals(mulFactor*collector.totalHits + NUM_EXTRA_DOCS/2, bigSearcher.count(q3.build()));
|
||||||
assertEquals(mulFactor*collector.totalHits + NUM_EXTRA_DOCS/2, hits4.totalHits.value);
|
|
||||||
|
|
||||||
// test diff (randomized) scorers produce the same results on bigSearcher as well
|
// test diff (randomized) scorers produce the same results on bigSearcher as well
|
||||||
collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1);
|
collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1);
|
||||||
|
|
|
@ -149,7 +149,7 @@ public class TestBooleanScorer extends LuceneTestCase {
|
||||||
q2.add(q1.build(), BooleanClause.Occur.SHOULD);
|
q2.add(q1.build(), BooleanClause.Occur.SHOULD);
|
||||||
q2.add(new CrazyMustUseBulkScorerQuery(), BooleanClause.Occur.SHOULD);
|
q2.add(new CrazyMustUseBulkScorerQuery(), BooleanClause.Occur.SHOULD);
|
||||||
|
|
||||||
assertEquals(1, s.search(q2.build(), 10).totalHits.value);
|
assertEquals(1, s.count(q2.build()));
|
||||||
r.close();
|
r.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
|
@ -148,7 +148,13 @@ public class TestLRUQueryCache extends LuceneTestCase {
|
||||||
TotalHitCountCollector collector = new TotalHitCountCollector();
|
TotalHitCountCollector collector = new TotalHitCountCollector();
|
||||||
searcher.search(q, collector); // will use the cache
|
searcher.search(q, collector); // will use the cache
|
||||||
final int totalHits1 = collector.getTotalHits();
|
final int totalHits1 = collector.getTotalHits();
|
||||||
final long totalHits2 = searcher.search(q, 1).totalHits.value; // will not use the cache because of scores
|
TotalHitCountCollector collector2 = new TotalHitCountCollector();
|
||||||
|
searcher.search(q, new FilterCollector(collector2) {
|
||||||
|
public ScoreMode scoreMode() {
|
||||||
|
return ScoreMode.COMPLETE; // will not use the cache because of scores
|
||||||
|
}
|
||||||
|
});
|
||||||
|
final long totalHits2 = collector2.getTotalHits();
|
||||||
assertEquals(totalHits2, totalHits1);
|
assertEquals(totalHits2, totalHits1);
|
||||||
} finally {
|
} finally {
|
||||||
mgr.release(searcher);
|
mgr.release(searcher);
|
||||||
|
|
|
@ -62,7 +62,7 @@ public class TestNeedsScores extends LuceneTestCase {
|
||||||
Query required = new TermQuery(new Term("field", "this"));
|
Query required = new TermQuery(new Term("field", "this"));
|
||||||
Query prohibited = new TermQuery(new Term("field", "3"));
|
Query prohibited = new TermQuery(new Term("field", "3"));
|
||||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||||
bq.add(new AssertNeedsScores(required, ScoreMode.COMPLETE), BooleanClause.Occur.MUST);
|
bq.add(new AssertNeedsScores(required, ScoreMode.TOP_SCORES), BooleanClause.Occur.MUST);
|
||||||
bq.add(new AssertNeedsScores(prohibited, ScoreMode.COMPLETE_NO_SCORES), BooleanClause.Occur.MUST_NOT);
|
bq.add(new AssertNeedsScores(prohibited, ScoreMode.COMPLETE_NO_SCORES), BooleanClause.Occur.MUST_NOT);
|
||||||
assertEquals(4, searcher.search(bq.build(), 5).totalHits.value); // we exclude 3
|
assertEquals(4, searcher.search(bq.build(), 5).totalHits.value); // we exclude 3
|
||||||
}
|
}
|
||||||
|
|
|
@ -384,7 +384,7 @@ public class TestShardSearching extends ShardSearchingTestBase {
|
||||||
sd.doc += base[sd.shardIndex];
|
sd.doc += base[sd.shardIndex];
|
||||||
}
|
}
|
||||||
|
|
||||||
TestUtil.assertEquals(hits, shardHits);
|
TestUtil.assertConsistent(hits, shardHits);
|
||||||
|
|
||||||
if (moreHits) {
|
if (moreHits) {
|
||||||
// Return a continuation:
|
// Return a continuation:
|
||||||
|
|
|
@ -372,7 +372,7 @@ public class TestTopDocsMerge extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TestUtil.assertEquals(topHits, mergedHits);
|
TestUtil.assertConsistent(topHits, mergedHits);
|
||||||
}
|
}
|
||||||
reader.close();
|
reader.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
|
|
|
@ -311,14 +311,8 @@ public class ToParentBlockJoinQuery extends Query {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public float getMaxScore(int upTo) throws IOException {
|
public float getMaxScore(int upTo) throws IOException {
|
||||||
switch(scoreMode) {
|
|
||||||
case Max:
|
|
||||||
case Min:
|
|
||||||
return childScorer.getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
|
|
||||||
default:
|
|
||||||
return Float.POSITIVE_INFINITY;
|
return Float.POSITIVE_INFINITY;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
private void setScoreAndFreq() throws IOException {
|
private void setScoreAndFreq() throws IOException {
|
||||||
if (childApproximation.docID() >= parentApproximation.docID()) {
|
if (childApproximation.docID() >= parentApproximation.docID()) {
|
||||||
|
|
|
@ -92,6 +92,7 @@ import org.apache.lucene.mockfile.WindowsFS;
|
||||||
import org.apache.lucene.search.FieldDoc;
|
import org.apache.lucene.search.FieldDoc;
|
||||||
import org.apache.lucene.search.ScoreDoc;
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
import org.apache.lucene.search.TopDocs;
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.search.TotalHits;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
import org.apache.lucene.store.FilterDirectory;
|
import org.apache.lucene.store.FilterDirectory;
|
||||||
|
@ -1040,9 +1041,20 @@ public final class TestUtil {
|
||||||
Assert.assertEquals("Reflection does not produce same map", reflectedValues, map);
|
Assert.assertEquals("Reflection does not produce same map", reflectedValues, map);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void assertEquals(TopDocs expected, TopDocs actual) {
|
/**
|
||||||
|
* Assert that the given {@link TopDocs} have the same top docs and consistent hit counts.
|
||||||
|
*/
|
||||||
|
public static void assertConsistent(TopDocs expected, TopDocs actual) {
|
||||||
|
Assert.assertEquals("wrong total hits", expected.totalHits.value == 0, actual.totalHits.value == 0);
|
||||||
|
if (expected.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
|
||||||
|
if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
|
||||||
Assert.assertEquals("wrong total hits", expected.totalHits.value, actual.totalHits.value);
|
Assert.assertEquals("wrong total hits", expected.totalHits.value, actual.totalHits.value);
|
||||||
Assert.assertEquals("wrong total hits", expected.totalHits.relation, actual.totalHits.relation);
|
} else {
|
||||||
|
Assert.assertTrue("wrong total hits", expected.totalHits.value >= actual.totalHits.value);
|
||||||
|
}
|
||||||
|
} else if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
|
||||||
|
Assert.assertTrue("wrong total hits", expected.totalHits.value <= actual.totalHits.value);
|
||||||
|
}
|
||||||
Assert.assertEquals("wrong hit count", expected.scoreDocs.length, actual.scoreDocs.length);
|
Assert.assertEquals("wrong hit count", expected.scoreDocs.length, actual.scoreDocs.length);
|
||||||
for(int hitIDX=0;hitIDX<expected.scoreDocs.length;hitIDX++) {
|
for(int hitIDX=0;hitIDX<expected.scoreDocs.length;hitIDX++) {
|
||||||
final ScoreDoc expectedSD = expected.scoreDocs[hitIDX];
|
final ScoreDoc expectedSD = expected.scoreDocs[hitIDX];
|
||||||
|
|
Loading…
Reference in New Issue