LUCENE-8060: IndexSearcher's search and searchAfter methods now only compute total hit counts accurately up to 1,000.

This commit is contained in:
Adrien Grand 2018-08-01 09:00:40 +02:00
parent 0dc124aa78
commit 99dbe93681
11 changed files with 58 additions and 24 deletions

View File

@ -96,6 +96,10 @@ Changes in Runtime Behavior
* LUCENE-7444: StandardAnalyzer no longer defaults to removing English stopwords * LUCENE-7444: StandardAnalyzer no longer defaults to removing English stopwords
(Alan Woodward) (Alan Woodward)
* LUCENE-8060: IndexSearcher's search and searchAfter methods now only compute
total hit counts accurately up to 1,000 in order to enable top-hits
optimizations such as block-max WAND (LUCENE-8135). (Adrien Grand)
Improvements Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities. * LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.

View File

@ -90,10 +90,12 @@ request in order to only compute scores for documents that made it to the top
hits. As a consequence, the trackDocScores option has been removed and can be hits. As a consequence, the trackDocScores option has been removed and can be
replaced with the new TopFieldCollector#populateScores helper method. replaced with the new TopFieldCollector#populateScores helper method.
## TopDocs.totalHits is no longer a long ## ## IndexSearcher.search(After) may return lower bounds of the hit count and TopDocs.totalHits is no longer a long ##
Lucene 8 received optimizations for collection of top-k matches by not visiting Lucene 8 received optimizations for collection of top-k matches by not visiting
all matches. However these optimizations won't help if all matches still need all matches. However these optimizations won't help if all matches still need
to be visited in order to compute the total number of hits. As a consequence, to be visited in order to compute the total number of hits. As a consequence,
TopDocs.totalHits is now an TotalHits object that is either an exact hit count IndexSearcher's search and searchAfter methods were changed to only count hits
or a lower bound of the hit count. accurately up to 1,000, and TopDocs.totalHits was changed from a long to an
object that says whether the hit count is accurate or a lower bound of the
actual hit count.

View File

@ -63,7 +63,19 @@ import org.apache.lucene.util.ThreadInterruptedException;
* reader ({@link DirectoryReader#open(IndexWriter)}). * reader ({@link DirectoryReader#open(IndexWriter)}).
* Once you have a new {@link IndexReader}, it's relatively * Once you have a new {@link IndexReader}, it's relatively
* cheap to create a new IndexSearcher from it. * cheap to create a new IndexSearcher from it.
* *
* <p><b>NOTE</b>: The {@link #search} and {@link #searchAfter} methods are
* configured to only count top hits accurately up to {@code 1,000} and may
* return a {@link TotalHits.Relation lower bound} of the hit count if the
* hit count is greater than or equal to {@code 1,000}. On queries that match
* lots of documents, counting the number of hits may take much longer than
* computing the top hits, so this trade-off makes it possible to get some minimal
* information about the hit count without slowing down search too much. The
* {@link TopDocs#scoreDocs} array is always accurate, however. If this behavior
* doesn't suit your needs, you should create collectors manually with either
* {@link TopScoreDocCollector#create} or {@link TopFieldCollector#create} and
* call {@link #search(Query, Collector)}.
*
* <a name="thread-safety"></a><p><b>NOTE</b>: <code>{@link * <a name="thread-safety"></a><p><b>NOTE</b>: <code>{@link
* IndexSearcher}</code> instances are completely * IndexSearcher}</code> instances are completely
* thread safe, meaning multiple threads can call any of its * thread safe, meaning multiple threads can call any of its
@ -82,6 +94,11 @@ public class IndexSearcher {
final long maxRamBytesUsed = Math.min(1L << 25, Runtime.getRuntime().maxMemory() / 20); final long maxRamBytesUsed = Math.min(1L << 25, Runtime.getRuntime().maxMemory() / 20);
DEFAULT_QUERY_CACHE = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed); DEFAULT_QUERY_CACHE = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed);
} }
/**
* By default we count hits accurately up to 1000. This makes sure that we
* don't spend most of the search time computing hit counts.
*/
private static final int TOTAL_HITS_THRESHOLD = 1000;
final IndexReader reader; // package private for testing! final IndexReader reader; // package private for testing!
@ -384,7 +401,7 @@ public class IndexSearcher {
@Override @Override
public TopScoreDocCollector newCollector() throws IOException { public TopScoreDocCollector newCollector() throws IOException {
return TopScoreDocCollector.create(cappedNumHits, after, Integer.MAX_VALUE); return TopScoreDocCollector.create(cappedNumHits, after, TOTAL_HITS_THRESHOLD);
} }
@Override @Override
@ -513,7 +530,7 @@ public class IndexSearcher {
@Override @Override
public TopFieldCollector newCollector() throws IOException { public TopFieldCollector newCollector() throws IOException {
// TODO: don't pay the price for accurate hit counts by default // TODO: don't pay the price for accurate hit counts by default
return TopFieldCollector.create(rewrittenSort, cappedNumHits, after, Integer.MAX_VALUE); return TopFieldCollector.create(rewrittenSort, cappedNumHits, after, TOTAL_HITS_THRESHOLD);
} }
@Override @Override

View File

@ -255,7 +255,7 @@ public class TestBoolean2 extends LuceneTestCase {
// sanity check expected num matches in bigSearcher // sanity check expected num matches in bigSearcher
assertEquals(mulFactor * collector.totalHits, assertEquals(mulFactor * collector.totalHits,
bigSearcher.search(query, 1).totalHits.value); bigSearcher.count(query));
// now check 2 diff scorers from the bigSearcher as well // now check 2 diff scorers from the bigSearcher as well
collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE); collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
@ -398,8 +398,7 @@ public class TestBoolean2 extends LuceneTestCase {
BooleanQuery.Builder q3 = new BooleanQuery.Builder(); BooleanQuery.Builder q3 = new BooleanQuery.Builder();
q3.add(q1, BooleanClause.Occur.SHOULD); q3.add(q1, BooleanClause.Occur.SHOULD);
q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD); q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD);
TopDocs hits4 = bigSearcher.search(q3.build(), 1); assertEquals(mulFactor*collector.totalHits + NUM_EXTRA_DOCS/2, bigSearcher.count(q3.build()));
assertEquals(mulFactor*collector.totalHits + NUM_EXTRA_DOCS/2, hits4.totalHits.value);
// test diff (randomized) scorers produce the same results on bigSearcher as well // test diff (randomized) scorers produce the same results on bigSearcher as well
collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1); collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1);

View File

@ -149,7 +149,7 @@ public class TestBooleanScorer extends LuceneTestCase {
q2.add(q1.build(), BooleanClause.Occur.SHOULD); q2.add(q1.build(), BooleanClause.Occur.SHOULD);
q2.add(new CrazyMustUseBulkScorerQuery(), BooleanClause.Occur.SHOULD); q2.add(new CrazyMustUseBulkScorerQuery(), BooleanClause.Occur.SHOULD);
assertEquals(1, s.search(q2.build(), 10).totalHits.value); assertEquals(1, s.count(q2.build()));
r.close(); r.close();
dir.close(); dir.close();
} }

View File

@ -148,7 +148,13 @@ public class TestLRUQueryCache extends LuceneTestCase {
TotalHitCountCollector collector = new TotalHitCountCollector(); TotalHitCountCollector collector = new TotalHitCountCollector();
searcher.search(q, collector); // will use the cache searcher.search(q, collector); // will use the cache
final int totalHits1 = collector.getTotalHits(); final int totalHits1 = collector.getTotalHits();
final long totalHits2 = searcher.search(q, 1).totalHits.value; // will not use the cache because of scores TotalHitCountCollector collector2 = new TotalHitCountCollector();
searcher.search(q, new FilterCollector(collector2) {
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE; // will not use the cache because of scores
}
});
final long totalHits2 = collector2.getTotalHits();
assertEquals(totalHits2, totalHits1); assertEquals(totalHits2, totalHits1);
} finally { } finally {
mgr.release(searcher); mgr.release(searcher);

View File

@ -62,7 +62,7 @@ public class TestNeedsScores extends LuceneTestCase {
Query required = new TermQuery(new Term("field", "this")); Query required = new TermQuery(new Term("field", "this"));
Query prohibited = new TermQuery(new Term("field", "3")); Query prohibited = new TermQuery(new Term("field", "3"));
BooleanQuery.Builder bq = new BooleanQuery.Builder(); BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(new AssertNeedsScores(required, ScoreMode.COMPLETE), BooleanClause.Occur.MUST); bq.add(new AssertNeedsScores(required, ScoreMode.TOP_SCORES), BooleanClause.Occur.MUST);
bq.add(new AssertNeedsScores(prohibited, ScoreMode.COMPLETE_NO_SCORES), BooleanClause.Occur.MUST_NOT); bq.add(new AssertNeedsScores(prohibited, ScoreMode.COMPLETE_NO_SCORES), BooleanClause.Occur.MUST_NOT);
assertEquals(4, searcher.search(bq.build(), 5).totalHits.value); // we exclude 3 assertEquals(4, searcher.search(bq.build(), 5).totalHits.value); // we exclude 3
} }

View File

@ -384,7 +384,7 @@ public class TestShardSearching extends ShardSearchingTestBase {
sd.doc += base[sd.shardIndex]; sd.doc += base[sd.shardIndex];
} }
TestUtil.assertEquals(hits, shardHits); TestUtil.assertConsistent(hits, shardHits);
if (moreHits) { if (moreHits) {
// Return a continuation: // Return a continuation:

View File

@ -372,7 +372,7 @@ public class TestTopDocsMerge extends LuceneTestCase {
} }
} }
TestUtil.assertEquals(topHits, mergedHits); TestUtil.assertConsistent(topHits, mergedHits);
} }
reader.close(); reader.close();
dir.close(); dir.close();

View File

@ -311,13 +311,7 @@ public class ToParentBlockJoinQuery extends Query {
@Override @Override
public float getMaxScore(int upTo) throws IOException { public float getMaxScore(int upTo) throws IOException {
switch(scoreMode) { return Float.POSITIVE_INFINITY;
case Max:
case Min:
return childScorer.getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
default:
return Float.POSITIVE_INFINITY;
}
} }
private void setScoreAndFreq() throws IOException { private void setScoreAndFreq() throws IOException {

View File

@ -92,6 +92,7 @@ import org.apache.lucene.mockfile.WindowsFS;
import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.FilterDirectory;
@ -1040,9 +1041,20 @@ public final class TestUtil {
Assert.assertEquals("Reflection does not produce same map", reflectedValues, map); Assert.assertEquals("Reflection does not produce same map", reflectedValues, map);
} }
public static void assertEquals(TopDocs expected, TopDocs actual) { /**
Assert.assertEquals("wrong total hits", expected.totalHits.value, actual.totalHits.value); * Assert that the given {@link TopDocs} have the same top docs and consistent hit counts.
Assert.assertEquals("wrong total hits", expected.totalHits.relation, actual.totalHits.relation); */
public static void assertConsistent(TopDocs expected, TopDocs actual) {
Assert.assertEquals("wrong total hits", expected.totalHits.value == 0, actual.totalHits.value == 0);
if (expected.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
Assert.assertEquals("wrong total hits", expected.totalHits.value, actual.totalHits.value);
} else {
Assert.assertTrue("wrong total hits", expected.totalHits.value >= actual.totalHits.value);
}
} else if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
Assert.assertTrue("wrong total hits", expected.totalHits.value <= actual.totalHits.value);
}
Assert.assertEquals("wrong hit count", expected.scoreDocs.length, actual.scoreDocs.length); Assert.assertEquals("wrong hit count", expected.scoreDocs.length, actual.scoreDocs.length);
for(int hitIDX=0;hitIDX<expected.scoreDocs.length;hitIDX++) { for(int hitIDX=0;hitIDX<expected.scoreDocs.length;hitIDX++) {
final ScoreDoc expectedSD = expected.scoreDocs[hitIDX]; final ScoreDoc expectedSD = expected.scoreDocs[hitIDX];