mirror of https://github.com/apache/lucene.git
LUCENE-8875: Introduce Optimized Collector For Large Number Of Hits (#754)
This commit introduces a new collector that is optimized for cases where the number of requested hits is large and/or the hits actually collected are sparse compared to the number of hits requested.
This commit is contained in:
parent
fb30ded643
commit
ee79a20174
|
@ -143,6 +143,9 @@ Improvements
|
||||||
* LUCENE-8845: Allow Intervals.prefix() and Intervals.wildcard() to specify
|
* LUCENE-8845: Allow Intervals.prefix() and Intervals.wildcard() to specify
|
||||||
their maximum allowed expansions (Alan Woodward)
|
their maximum allowed expansions (Alan Woodward)
|
||||||
|
|
||||||
|
* LUCENE-8875: Introduce a Collector optimized for use cases when a large
|
||||||
|
number of hits are requested (Atri Sharma)
|
||||||
|
|
||||||
* LUCENE-8848 LUCENE-7757 LUCENE-8492: The UnifiedHighlighter now detects that parts of the query are not understood by
|
* LUCENE-8848 LUCENE-7757 LUCENE-8492: The UnifiedHighlighter now detects that parts of the query are not understood by
|
||||||
it, and thus it should not make optimizations that result in no highlights or slow highlighting. This generally works
|
it, and thus it should not make optimizations that result in no highlights or slow highlighting. This generally works
|
||||||
best for WEIGHT_MATCHES mode. Consequently queries produced by ComplexPhraseQueryParser and the surround QueryParser
|
best for WEIGHT_MATCHES mode. Consequently queries produced by ComplexPhraseQueryParser and the surround QueryParser
|
||||||
|
|
|
@ -0,0 +1,157 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
|
||||||
|
import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optimized collector for large number of hits.
|
||||||
|
* The collector maintains an ArrayList of hits until it accumulates
|
||||||
|
* the requested number of hits. Post that, it builds a Priority Queue
|
||||||
|
* and starts filtering further hits based on the minimum competitive
|
||||||
|
* score.
|
||||||
|
*/
|
||||||
|
public final class LargeNumHitsTopDocsCollector implements Collector {
|
||||||
|
private final int requestedHitCount;
|
||||||
|
private List<ScoreDoc> hits = new ArrayList<>();
|
||||||
|
// package private for testing
|
||||||
|
HitQueue pq;
|
||||||
|
ScoreDoc pqTop;
|
||||||
|
int totalHits;
|
||||||
|
|
||||||
|
public LargeNumHitsTopDocsCollector(int requestedHitCount) {
|
||||||
|
this.requestedHitCount = requestedHitCount;
|
||||||
|
this.totalHits = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We always return COMPLETE since this collector should ideally
|
||||||
|
// be used only with large number of hits case
|
||||||
|
@Override
|
||||||
|
public ScoreMode scoreMode() {
|
||||||
|
return ScoreMode.COMPLETE;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LeafCollector getLeafCollector(LeafReaderContext context) {
|
||||||
|
final int docBase = context.docBase;
|
||||||
|
return new TopScoreDocCollector.ScorerLeafCollector() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setScorer(Scorable scorer) throws IOException {
|
||||||
|
super.setScorer(scorer);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void collect(int doc) throws IOException {
|
||||||
|
float score = scorer.score();
|
||||||
|
|
||||||
|
// This collector relies on the fact that scorers produce positive values:
|
||||||
|
assert score >= 0; // NOTE: false for NaN
|
||||||
|
|
||||||
|
if (totalHits < requestedHitCount) {
|
||||||
|
hits.add(new ScoreDoc(doc + docBase, score));
|
||||||
|
totalHits++;
|
||||||
|
return;
|
||||||
|
} else if (totalHits == requestedHitCount) {
|
||||||
|
// Convert the list to a priority queue
|
||||||
|
|
||||||
|
// We should get here only when priority queue
|
||||||
|
// has not been built
|
||||||
|
assert pq == null;
|
||||||
|
assert pqTop == null;
|
||||||
|
pq = new HitQueue(requestedHitCount, false);
|
||||||
|
|
||||||
|
for (ScoreDoc scoreDoc : hits) {
|
||||||
|
pq.add(scoreDoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
pqTop = pq.top();
|
||||||
|
hits = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (score > pqTop.score) {
|
||||||
|
pqTop.doc = doc + docBase;
|
||||||
|
pqTop.score = score;
|
||||||
|
pqTop = pq.updateTop();
|
||||||
|
}
|
||||||
|
++totalHits;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the top docs that were collected by this collector. */
|
||||||
|
public TopDocs topDocs(int howMany) {
|
||||||
|
|
||||||
|
if (howMany <= 0 || howMany > totalHits) {
|
||||||
|
throw new IllegalArgumentException("Incorrect number of hits requested");
|
||||||
|
}
|
||||||
|
|
||||||
|
ScoreDoc[] results = new ScoreDoc[howMany];
|
||||||
|
|
||||||
|
// Get the requested results from either hits list or PQ
|
||||||
|
populateResults(results, howMany);
|
||||||
|
|
||||||
|
return newTopDocs(results);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Populates the results array with the ScoreDoc instances. This can be
|
||||||
|
* overridden in case a different ScoreDoc type should be returned.
|
||||||
|
*/
|
||||||
|
protected void populateResults(ScoreDoc[] results, int howMany) {
|
||||||
|
if (pq != null) {
|
||||||
|
assert totalHits >= requestedHitCount;
|
||||||
|
for (int i = howMany - 1; i >= 0; i--) {
|
||||||
|
results[i] = pq.pop();
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Total number of hits collected were less than requestedHitCount
|
||||||
|
assert totalHits < requestedHitCount;
|
||||||
|
Collections.sort(hits, Comparator.comparing((ScoreDoc scoreDoc) ->
|
||||||
|
scoreDoc.score).reversed().thenComparing(scoreDoc -> scoreDoc.doc));
|
||||||
|
|
||||||
|
for (int i = 0; i < howMany; i++) {
|
||||||
|
results[i] = hits.get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a {@link TopDocs} instance containing the given results. If
|
||||||
|
* <code>results</code> is null it means there are no results to return,
|
||||||
|
* either because there were 0 calls to collect() or because the arguments to
|
||||||
|
* topDocs were invalid.
|
||||||
|
*/
|
||||||
|
protected TopDocs newTopDocs(ScoreDoc[] results) {
|
||||||
|
return results == null ? EMPTY_TOPDOCS : new TopDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), results);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the top docs that were collected by this collector. */
|
||||||
|
public TopDocs topDocs() {
|
||||||
|
return topDocs(Math.min(totalHits, requestedHitCount));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,158 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
public class TestLargeNumHitsTopDocsCollector extends LuceneTestCase {
|
||||||
|
private Directory dir;
|
||||||
|
private IndexReader reader;
|
||||||
|
private final Query testQuery = new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("field", "5")), BooleanClause.Occur.SHOULD)
|
||||||
|
.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||||
|
for (int i = 0; i < 1_000; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newStringField("field", "5", Field.Store.NO));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
writer.addDocument(new Document());
|
||||||
|
}
|
||||||
|
reader = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
dir = null;
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
public void testRequestMoreHitsThanCollected() throws Exception {
|
||||||
|
runNumHits(150);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSingleNumHit() throws Exception {
|
||||||
|
runNumHits(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRequestLessHitsThanCollected() throws Exception {
|
||||||
|
runNumHits(25);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIllegalArguments() throws IOException {
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(15);
|
||||||
|
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(15, null, Integer.MAX_VALUE);
|
||||||
|
|
||||||
|
searcher.search(testQuery, largeCollector);
|
||||||
|
searcher.search(testQuery, regularCollector);
|
||||||
|
|
||||||
|
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||||
|
|
||||||
|
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||||
|
largeCollector.topDocs(350_000);
|
||||||
|
});
|
||||||
|
|
||||||
|
assertTrue(expected.getMessage().contains("Incorrect number of hits requested"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNoPQBuild() throws IOException {
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(250_000);
|
||||||
|
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(250_000, null, Integer.MAX_VALUE);
|
||||||
|
|
||||||
|
searcher.search(testQuery, largeCollector);
|
||||||
|
searcher.search(testQuery, regularCollector);
|
||||||
|
|
||||||
|
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||||
|
|
||||||
|
assertEquals(largeCollector.pq, null);
|
||||||
|
assertEquals(largeCollector.pqTop, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPQBuild() throws IOException {
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(50);
|
||||||
|
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(50, null, Integer.MAX_VALUE);
|
||||||
|
|
||||||
|
searcher.search(testQuery, largeCollector);
|
||||||
|
searcher.search(testQuery, regularCollector);
|
||||||
|
|
||||||
|
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||||
|
|
||||||
|
assertNotEquals(largeCollector.pq, null);
|
||||||
|
assertNotEquals(largeCollector.pqTop, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNoPQHitsOrder() throws IOException {
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(250_000);
|
||||||
|
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(250_000, null, Integer.MAX_VALUE);
|
||||||
|
|
||||||
|
searcher.search(testQuery, largeCollector);
|
||||||
|
searcher.search(testQuery, regularCollector);
|
||||||
|
|
||||||
|
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||||
|
|
||||||
|
assertEquals(largeCollector.pq, null);
|
||||||
|
assertEquals(largeCollector.pqTop, null);
|
||||||
|
|
||||||
|
TopDocs topDocs = largeCollector.topDocs();
|
||||||
|
|
||||||
|
if (topDocs.scoreDocs.length > 0) {
|
||||||
|
float preScore = topDocs.scoreDocs[0].score;
|
||||||
|
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
|
||||||
|
assert scoreDoc.score <= preScore;
|
||||||
|
preScore = scoreDoc.score;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void runNumHits(int numHits) throws IOException {
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(numHits);
|
||||||
|
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(numHits, null, Integer.MAX_VALUE);
|
||||||
|
|
||||||
|
searcher.search(testQuery, largeCollector);
|
||||||
|
searcher.search(testQuery, regularCollector);
|
||||||
|
|
||||||
|
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||||
|
|
||||||
|
TopDocs firstTopDocs = largeCollector.topDocs();
|
||||||
|
TopDocs secondTopDocs = regularCollector.topDocs();
|
||||||
|
|
||||||
|
assertEquals(firstTopDocs.scoreDocs.length, secondTopDocs.scoreDocs.length);
|
||||||
|
|
||||||
|
CheckHits.checkEqual(testQuery, firstTopDocs.scoreDocs, secondTopDocs.scoreDocs);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue