mirror of https://github.com/apache/lucene.git
LUCENE-8875: Introduce Optimized Collector For Large Number Of Hits (#754)
This commit introduces a new collector which is optimized for cases when the number of hits is large and/or the actual hits collected are sparse in comparison to the number of hits requested.
This commit is contained in:
parent
fb30ded643
commit
ee79a20174
|
@ -143,6 +143,9 @@ Improvements
|
|||
* LUCENE-8845: Allow Intervals.prefix() and Intervals.wildcard() to specify
|
||||
their maximum allowed expansions (Alan Woodward)
|
||||
|
||||
* LUCENE-8875: Introduce a Collector optimized for use cases when a large
|
||||
number of hits are requested (Atri Sharma)
|
||||
|
||||
* LUCENE-8848 LUCENE-7757 LUCENE-8492: The UnifiedHighlighter now detects that parts of the query are not understood by
|
||||
it, and thus it should not make optimizations that result in no highlights or slow highlighting. This generally works
|
||||
best for WEIGHT_MATCHES mode. Consequently queries produced by ComplexPhraseQueryParser and the surround QueryParser
|
||||
|
|
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
||||
import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;
|
||||
|
||||
/**
|
||||
* Optimized collector for large number of hits.
|
||||
* The collector maintains an ArrayList of hits until it accumulates
|
||||
* the requested number of hits. Post that, it builds a Priority Queue
|
||||
* and starts filtering further hits based on the minimum competitive
|
||||
* score.
|
||||
*/
|
||||
public final class LargeNumHitsTopDocsCollector implements Collector {
|
||||
private final int requestedHitCount;
|
||||
private List<ScoreDoc> hits = new ArrayList<>();
|
||||
// package private for testing
|
||||
HitQueue pq;
|
||||
ScoreDoc pqTop;
|
||||
int totalHits;
|
||||
|
||||
public LargeNumHitsTopDocsCollector(int requestedHitCount) {
|
||||
this.requestedHitCount = requestedHitCount;
|
||||
this.totalHits = 0;
|
||||
}
|
||||
|
||||
// We always return COMPLETE since this collector should ideally
|
||||
// be used only with large number of hits case
|
||||
@Override
|
||||
public ScoreMode scoreMode() {
|
||||
return ScoreMode.COMPLETE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LeafCollector getLeafCollector(LeafReaderContext context) {
|
||||
final int docBase = context.docBase;
|
||||
return new TopScoreDocCollector.ScorerLeafCollector() {
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorable scorer) throws IOException {
|
||||
super.setScorer(scorer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
float score = scorer.score();
|
||||
|
||||
// This collector relies on the fact that scorers produce positive values:
|
||||
assert score >= 0; // NOTE: false for NaN
|
||||
|
||||
if (totalHits < requestedHitCount) {
|
||||
hits.add(new ScoreDoc(doc + docBase, score));
|
||||
totalHits++;
|
||||
return;
|
||||
} else if (totalHits == requestedHitCount) {
|
||||
// Convert the list to a priority queue
|
||||
|
||||
// We should get here only when priority queue
|
||||
// has not been built
|
||||
assert pq == null;
|
||||
assert pqTop == null;
|
||||
pq = new HitQueue(requestedHitCount, false);
|
||||
|
||||
for (ScoreDoc scoreDoc : hits) {
|
||||
pq.add(scoreDoc);
|
||||
}
|
||||
|
||||
pqTop = pq.top();
|
||||
hits = null;
|
||||
}
|
||||
|
||||
if (score > pqTop.score) {
|
||||
pqTop.doc = doc + docBase;
|
||||
pqTop.score = score;
|
||||
pqTop = pq.updateTop();
|
||||
}
|
||||
++totalHits;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/** Returns the top docs that were collected by this collector. */
|
||||
public TopDocs topDocs(int howMany) {
|
||||
|
||||
if (howMany <= 0 || howMany > totalHits) {
|
||||
throw new IllegalArgumentException("Incorrect number of hits requested");
|
||||
}
|
||||
|
||||
ScoreDoc[] results = new ScoreDoc[howMany];
|
||||
|
||||
// Get the requested results from either hits list or PQ
|
||||
populateResults(results, howMany);
|
||||
|
||||
return newTopDocs(results);
|
||||
}
|
||||
|
||||
/**
|
||||
* Populates the results array with the ScoreDoc instances. This can be
|
||||
* overridden in case a different ScoreDoc type should be returned.
|
||||
*/
|
||||
protected void populateResults(ScoreDoc[] results, int howMany) {
|
||||
if (pq != null) {
|
||||
assert totalHits >= requestedHitCount;
|
||||
for (int i = howMany - 1; i >= 0; i--) {
|
||||
results[i] = pq.pop();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Total number of hits collected were less than requestedHitCount
|
||||
assert totalHits < requestedHitCount;
|
||||
Collections.sort(hits, Comparator.comparing((ScoreDoc scoreDoc) ->
|
||||
scoreDoc.score).reversed().thenComparing(scoreDoc -> scoreDoc.doc));
|
||||
|
||||
for (int i = 0; i < howMany; i++) {
|
||||
results[i] = hits.get(i);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a {@link TopDocs} instance containing the given results. If
|
||||
* <code>results</code> is null it means there are no results to return,
|
||||
* either because there were 0 calls to collect() or because the arguments to
|
||||
* topDocs were invalid.
|
||||
*/
|
||||
protected TopDocs newTopDocs(ScoreDoc[] results) {
|
||||
return results == null ? EMPTY_TOPDOCS : new TopDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), results);
|
||||
}
|
||||
|
||||
/** Returns the top docs that were collected by this collector. */
|
||||
public TopDocs topDocs() {
|
||||
return topDocs(Math.min(totalHits, requestedHitCount));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,158 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestLargeNumHitsTopDocsCollector extends LuceneTestCase {
|
||||
private Directory dir;
|
||||
private IndexReader reader;
|
||||
private final Query testQuery = new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("field", "5")), BooleanClause.Occur.SHOULD)
|
||||
.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
for (int i = 0; i < 1_000; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(newStringField("field", "5", Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
writer.addDocument(new Document());
|
||||
}
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
reader.close();
|
||||
dir.close();
|
||||
dir = null;
|
||||
super.tearDown();
|
||||
}
|
||||
public void testRequestMoreHitsThanCollected() throws Exception {
|
||||
runNumHits(150);
|
||||
}
|
||||
|
||||
public void testSingleNumHit() throws Exception {
|
||||
runNumHits(1);
|
||||
}
|
||||
|
||||
public void testRequestLessHitsThanCollected() throws Exception {
|
||||
runNumHits(25);
|
||||
}
|
||||
|
||||
public void testIllegalArguments() throws IOException {
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(15);
|
||||
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(15, null, Integer.MAX_VALUE);
|
||||
|
||||
searcher.search(testQuery, largeCollector);
|
||||
searcher.search(testQuery, regularCollector);
|
||||
|
||||
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||
|
||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
largeCollector.topDocs(350_000);
|
||||
});
|
||||
|
||||
assertTrue(expected.getMessage().contains("Incorrect number of hits requested"));
|
||||
}
|
||||
|
||||
public void testNoPQBuild() throws IOException {
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(250_000);
|
||||
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(250_000, null, Integer.MAX_VALUE);
|
||||
|
||||
searcher.search(testQuery, largeCollector);
|
||||
searcher.search(testQuery, regularCollector);
|
||||
|
||||
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||
|
||||
assertEquals(largeCollector.pq, null);
|
||||
assertEquals(largeCollector.pqTop, null);
|
||||
}
|
||||
|
||||
public void testPQBuild() throws IOException {
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(50);
|
||||
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(50, null, Integer.MAX_VALUE);
|
||||
|
||||
searcher.search(testQuery, largeCollector);
|
||||
searcher.search(testQuery, regularCollector);
|
||||
|
||||
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||
|
||||
assertNotEquals(largeCollector.pq, null);
|
||||
assertNotEquals(largeCollector.pqTop, null);
|
||||
}
|
||||
|
||||
public void testNoPQHitsOrder() throws IOException {
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(250_000);
|
||||
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(250_000, null, Integer.MAX_VALUE);
|
||||
|
||||
searcher.search(testQuery, largeCollector);
|
||||
searcher.search(testQuery, regularCollector);
|
||||
|
||||
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||
|
||||
assertEquals(largeCollector.pq, null);
|
||||
assertEquals(largeCollector.pqTop, null);
|
||||
|
||||
TopDocs topDocs = largeCollector.topDocs();
|
||||
|
||||
if (topDocs.scoreDocs.length > 0) {
|
||||
float preScore = topDocs.scoreDocs[0].score;
|
||||
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
|
||||
assert scoreDoc.score <= preScore;
|
||||
preScore = scoreDoc.score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void runNumHits(int numHits) throws IOException {
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
LargeNumHitsTopDocsCollector largeCollector = new LargeNumHitsTopDocsCollector(numHits);
|
||||
TopScoreDocCollector regularCollector = TopScoreDocCollector.create(numHits, null, Integer.MAX_VALUE);
|
||||
|
||||
searcher.search(testQuery, largeCollector);
|
||||
searcher.search(testQuery, regularCollector);
|
||||
|
||||
assertEquals(largeCollector.totalHits, regularCollector.totalHits);
|
||||
|
||||
TopDocs firstTopDocs = largeCollector.topDocs();
|
||||
TopDocs secondTopDocs = regularCollector.topDocs();
|
||||
|
||||
assertEquals(firstTopDocs.scoreDocs.length, secondTopDocs.scoreDocs.length);
|
||||
|
||||
CheckHits.checkEqual(testQuery, firstTopDocs.scoreDocs, secondTopDocs.scoreDocs);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue