mirror of https://github.com/apache/lucene.git
LUCENE-6850: Optimize BooleanScorer for sparse clauses.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1710591 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b3ee21b7e9
commit
915992b088
|
@ -156,6 +156,8 @@ Optimizations
|
|||
strings larger than 64kb by an amount equal to string's utf8 size.
|
||||
(Dawid Weiss, Robert Muir, shalin)
|
||||
|
||||
* LUCENE-6850: Optimize BooleanScorer for sparse clauses. (Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display
|
||||
|
|
|
@ -83,11 +83,11 @@ final class BooleanScorer extends BulkScorer {
|
|||
}
|
||||
|
||||
void advance(int min) throws IOException {
|
||||
score(null, min, min);
|
||||
score(orCollector, null, min, min);
|
||||
}
|
||||
|
||||
void score(Bits acceptDocs, int min, int max) throws IOException {
|
||||
next = scorer.score(orCollector, acceptDocs, min, max);
|
||||
void score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||
next = scorer.score(collector, acceptDocs, min, max);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -179,6 +179,9 @@ final class BooleanScorer extends BulkScorer {
|
|||
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
|
||||
throw new IllegalArgumentException("minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch);
|
||||
}
|
||||
if (scorers.size() <= 1) {
|
||||
throw new IllegalArgumentException("This scorer can only be used with two scorers or more, got " + scorers.size());
|
||||
}
|
||||
for (int i = 0; i < buckets.length; i++) {
|
||||
buckets[i] = new Bucket();
|
||||
}
|
||||
|
@ -237,12 +240,12 @@ final class BooleanScorer extends BulkScorer {
|
|||
}
|
||||
}
|
||||
|
||||
private void scoreWindow(LeafCollector collector, Bits acceptDocs, int base, int min, int max,
|
||||
BulkScorerAndDoc[] scorers, int numScorers) throws IOException {
|
||||
private void scoreWindowIntoBitSetAndReplay(LeafCollector collector, Bits acceptDocs,
|
||||
int base, int min, int max, BulkScorerAndDoc[] scorers, int numScorers) throws IOException {
|
||||
for (int i = 0; i < numScorers; ++i) {
|
||||
final BulkScorerAndDoc scorer = scorers[i];
|
||||
assert scorer.next < max;
|
||||
scorer.score(acceptDocs, min, max);
|
||||
scorer.score(orCollector, acceptDocs, min, max);
|
||||
}
|
||||
|
||||
scoreMatches(collector, base);
|
||||
|
@ -270,14 +273,7 @@ final class BooleanScorer extends BulkScorer {
|
|||
return headTop;
|
||||
}
|
||||
|
||||
private void scoreWindow(LeafCollector collector, Bits acceptDocs, int windowBase, int windowMin, int windowMax) throws IOException {
|
||||
// Fill 'leads' with all scorers from 'head' that are in the right window
|
||||
leads[0] = head.pop();
|
||||
int maxFreq = 1;
|
||||
while (head.size() > 0 && head.top().next < windowMax) {
|
||||
leads[maxFreq++] = head.pop();
|
||||
}
|
||||
|
||||
private void scoreWindowMultipleScorers(LeafCollector collector, Bits acceptDocs, int windowBase, int windowMin, int windowMax, int maxFreq) throws IOException {
|
||||
while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) {
|
||||
// a match is still possible
|
||||
final BulkScorerAndDoc candidate = tail.pop();
|
||||
|
@ -296,7 +292,7 @@ final class BooleanScorer extends BulkScorer {
|
|||
}
|
||||
tail.clear();
|
||||
|
||||
scoreWindow(collector, acceptDocs, windowBase, windowMin, windowMax, leads, maxFreq);
|
||||
scoreWindowIntoBitSetAndReplay(collector, acceptDocs, windowBase, windowMin, windowMax, leads, maxFreq);
|
||||
}
|
||||
|
||||
// Push back scorers into head and tail
|
||||
|
@ -308,21 +304,64 @@ final class BooleanScorer extends BulkScorer {
|
|||
}
|
||||
}
|
||||
|
||||
private void scoreWindowSingleScorer(BulkScorerAndDoc bulkScorer, LeafCollector collector,
|
||||
Bits acceptDocs, int windowMin, int windowMax, int max) throws IOException {
|
||||
assert tail.size() == 0;
|
||||
final int nextWindowBase = head.top().next & ~MASK;
|
||||
final int end = Math.max(windowMax, Math.min(max, nextWindowBase));
|
||||
|
||||
bulkScorer.score(collector, acceptDocs, windowMin, end);
|
||||
|
||||
// reset the scorer that should be used for the general case
|
||||
collector.setScorer(fakeScorer);
|
||||
}
|
||||
|
||||
private BulkScorerAndDoc scoreWindow(BulkScorerAndDoc top, LeafCollector collector,
|
||||
LeafCollector singleClauseCollector, Bits acceptDocs, int min, int max) throws IOException {
|
||||
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
|
||||
final int windowMin = Math.max(min, windowBase);
|
||||
final int windowMax = Math.min(max, windowBase + SIZE);
|
||||
|
||||
// Fill 'leads' with all scorers from 'head' that are in the right window
|
||||
leads[0] = head.pop();
|
||||
int maxFreq = 1;
|
||||
while (head.size() > 0 && head.top().next < windowMax) {
|
||||
leads[maxFreq++] = head.pop();
|
||||
}
|
||||
|
||||
if (minShouldMatch == 1 && maxFreq == 1) {
|
||||
// special case: only one scorer can match in the current window,
|
||||
// we can collect directly
|
||||
final BulkScorerAndDoc bulkScorer = leads[0];
|
||||
scoreWindowSingleScorer(bulkScorer, singleClauseCollector, acceptDocs, windowMin, windowMax, max);
|
||||
return head.add(bulkScorer);
|
||||
} else {
|
||||
// general case, collect through a bit set first and then replay
|
||||
scoreWindowMultipleScorers(collector, acceptDocs, windowBase, windowMin, windowMax, maxFreq);
|
||||
return head.top();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||
fakeScorer.doc = -1;
|
||||
collector.setScorer(fakeScorer);
|
||||
|
||||
final LeafCollector singleClauseCollector;
|
||||
if (coordFactors[1] == 1f) {
|
||||
singleClauseCollector = collector;
|
||||
} else {
|
||||
singleClauseCollector = new FilterLeafCollector(collector) {
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
super.setScorer(new BooleanTopLevelScorers.BoostedScorer(scorer, coordFactors[1]));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
BulkScorerAndDoc top = advance(min);
|
||||
while (top.next < max) {
|
||||
|
||||
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
|
||||
final int windowMin = Math.max(min, windowBase);
|
||||
final int windowMax = Math.min(max, windowBase + SIZE);
|
||||
|
||||
// general case
|
||||
scoreWindow(collector, acceptDocs, windowBase, windowMin, windowMax);
|
||||
top = head.top();
|
||||
top = scoreWindow(top, collector, singleClauseCollector, acceptDocs, min, max);
|
||||
}
|
||||
|
||||
return top.next;
|
||||
|
|
|
@ -22,6 +22,8 @@ import java.util.Arrays;
|
|||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
/** Internal document-at-a-time scorers used to deal with stupid coord() computation */
|
||||
class BooleanTopLevelScorers {
|
||||
|
||||
|
@ -48,7 +50,39 @@ class BooleanTopLevelScorers {
|
|||
return Collections.singleton(new ChildScorer(in, "BOOSTED"));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Used when there is more than one scorer in a query, but a segment
|
||||
* only had one non-null scorer.
|
||||
*/
|
||||
static class BoostedBulkScorer extends BulkScorer {
|
||||
|
||||
final BulkScorer in;
|
||||
final float boost;
|
||||
|
||||
BoostedBulkScorer(BulkScorer scorer, float boost) {
|
||||
this.in = scorer;
|
||||
this.boost = boost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||
final LeafCollector wrapped = new FilterLeafCollector(collector) {
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
super.setScorer(new BoostedScorer(scorer, boost));
|
||||
}
|
||||
};
|
||||
return in.score(wrapped, acceptDocs, min, max);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return in.cost();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Used when there are both mandatory and optional clauses, but minShouldMatch
|
||||
* dictates that some of the optional clauses must match. The query is a conjunction,
|
||||
|
|
|
@ -190,7 +190,7 @@ final class BooleanWeight extends Weight {
|
|||
/** Try to build a boolean scorer for this weight. Returns null if {@link BooleanScorer}
|
||||
* cannot be used. */
|
||||
// pkg-private for forcing use of BooleanScorer in tests
|
||||
BooleanScorer booleanScorer(LeafReaderContext context) throws IOException {
|
||||
BulkScorer booleanScorer(LeafReaderContext context) throws IOException {
|
||||
List<BulkScorer> optional = new ArrayList<BulkScorer>();
|
||||
Iterator<BooleanClause> cIter = query.iterator();
|
||||
for (Weight w : weights) {
|
||||
|
@ -222,12 +222,21 @@ final class BooleanWeight extends Weight {
|
|||
return null;
|
||||
}
|
||||
|
||||
if (optional.size() == 1) {
|
||||
BulkScorer opt = optional.get(0);
|
||||
if (!disableCoord && maxCoord > 1) {
|
||||
return new BooleanTopLevelScorers.BoostedBulkScorer(opt, coord(1, maxCoord));
|
||||
} else {
|
||||
return opt;
|
||||
}
|
||||
}
|
||||
|
||||
return new BooleanScorer(this, disableCoord, maxCoord, optional, Math.max(1, query.getMinimumNumberShouldMatch()), needsScores);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
|
||||
final BooleanScorer bulkScorer = booleanScorer(context);
|
||||
final BulkScorer bulkScorer = booleanScorer(context);
|
||||
if (bulkScorer != null) { // BooleanScorer is applicable
|
||||
// TODO: what is the right heuristic here?
|
||||
final long costThreshold;
|
||||
|
|
|
@ -18,17 +18,24 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.Weight.DefaultBulkScorer;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
public class TestBooleanScorer extends LuceneTestCase {
|
||||
private static final String FIELD = "category";
|
||||
|
@ -141,4 +148,93 @@ public class TestBooleanScorer extends LuceneTestCase {
|
|||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testOptimizeTopLevelClauseOrNull() throws IOException {
|
||||
// When there is a single non-null scorer, this scorer should be used
|
||||
// directly
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(new StringField("foo", "bar", Store.NO));
|
||||
w.addDocument(doc);
|
||||
IndexReader reader = w.getReader();
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
searcher.setQueryCache(null); // so that weights are not wrapped
|
||||
final LeafReaderContext ctx = reader.leaves().get(0);
|
||||
Query query = new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
|
||||
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
|
||||
.build();
|
||||
|
||||
// no scores -> term scorer
|
||||
Weight weight = searcher.createNormalizedWeight(query, false);
|
||||
BulkScorer scorer = ((BooleanWeight) weight).booleanScorer(ctx);
|
||||
assertTrue(scorer instanceof DefaultBulkScorer); // term scorer
|
||||
|
||||
// disabled coords -> term scorer
|
||||
query = new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
|
||||
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
|
||||
.setDisableCoord(true)
|
||||
.build();
|
||||
weight = searcher.createNormalizedWeight(query, true);
|
||||
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
|
||||
assertTrue(scorer instanceof DefaultBulkScorer); // term scorer
|
||||
|
||||
// enabled coords -> BoostedBulkScorer
|
||||
searcher.setSimilarity(new ClassicSimilarity());
|
||||
query = new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
|
||||
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
|
||||
.build();
|
||||
weight = searcher.createNormalizedWeight(query, true);
|
||||
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
|
||||
assertTrue(scorer instanceof BooleanTopLevelScorers.BoostedBulkScorer);
|
||||
|
||||
w.close();
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSparseClauseOptimization() throws IOException {
|
||||
// When some windows have only one scorer that can match, the scorer will
|
||||
// directly call the collector in this window
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document emptyDoc = new Document();
|
||||
final int numDocs = atLeast(10);
|
||||
for (int d = 0; d < numDocs; ++d) {
|
||||
for (int i = random().nextInt(5000); i >= 0; --i) {
|
||||
w.addDocument(emptyDoc);
|
||||
}
|
||||
Document doc = new Document();
|
||||
for (String value : Arrays.asList("foo", "bar", "baz")) {
|
||||
if (random().nextBoolean()) {
|
||||
doc.add(new StringField("field", value, Store.NO));
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = TestUtil.nextInt(random(), 3000, 5000); i >= 0; --i) {
|
||||
w.addDocument(emptyDoc);
|
||||
}
|
||||
if (random().nextBoolean()) {
|
||||
w.forceMerge(1);
|
||||
}
|
||||
IndexReader reader = w.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
|
||||
Query query = new BooleanQuery.Builder()
|
||||
.add(new BoostQuery(new TermQuery(new Term("field", "foo")), 3), Occur.SHOULD)
|
||||
.add(new BoostQuery(new TermQuery(new Term("field", "bar")), 3), Occur.SHOULD)
|
||||
.add(new BoostQuery(new TermQuery(new Term("field", "baz")), 3), Occur.SHOULD)
|
||||
.setDisableCoord(random().nextBoolean())
|
||||
.build();
|
||||
|
||||
// duel BS1 vs. BS2
|
||||
QueryUtils.check(random(), query, searcher);
|
||||
|
||||
reader.close();
|
||||
w.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue