LUCENE-6850: Optimize BooleanScorer for sparse clauses.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1710591 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2015-10-26 13:10:00 +00:00
parent b3ee21b7e9
commit 915992b088
5 changed files with 206 additions and 26 deletions

View File

@ -156,6 +156,8 @@ Optimizations
strings larger than 64kb by an amount equal to string's utf8 size.
(Dawid Weiss, Robert Muir, shalin)
* LUCENE-6850: Optimize BooleanScorer for sparse clauses. (Adrien Grand)
Bug Fixes
* LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display

View File

@ -83,11 +83,11 @@ final class BooleanScorer extends BulkScorer {
}
void advance(int min) throws IOException {
score(null, min, min);
score(orCollector, null, min, min);
}
void score(Bits acceptDocs, int min, int max) throws IOException {
next = scorer.score(orCollector, acceptDocs, min, max);
void score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
next = scorer.score(collector, acceptDocs, min, max);
}
}
@ -179,6 +179,9 @@ final class BooleanScorer extends BulkScorer {
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
throw new IllegalArgumentException("minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch);
}
if (scorers.size() <= 1) {
throw new IllegalArgumentException("This scorer can only be used with two scorers or more, got " + scorers.size());
}
for (int i = 0; i < buckets.length; i++) {
buckets[i] = new Bucket();
}
@ -237,12 +240,12 @@ final class BooleanScorer extends BulkScorer {
}
}
private void scoreWindow(LeafCollector collector, Bits acceptDocs, int base, int min, int max,
BulkScorerAndDoc[] scorers, int numScorers) throws IOException {
private void scoreWindowIntoBitSetAndReplay(LeafCollector collector, Bits acceptDocs,
int base, int min, int max, BulkScorerAndDoc[] scorers, int numScorers) throws IOException {
for (int i = 0; i < numScorers; ++i) {
final BulkScorerAndDoc scorer = scorers[i];
assert scorer.next < max;
scorer.score(acceptDocs, min, max);
scorer.score(orCollector, acceptDocs, min, max);
}
scoreMatches(collector, base);
@ -270,14 +273,7 @@ final class BooleanScorer extends BulkScorer {
return headTop;
}
private void scoreWindow(LeafCollector collector, Bits acceptDocs, int windowBase, int windowMin, int windowMax) throws IOException {
// Fill 'leads' with all scorers from 'head' that are in the right window
leads[0] = head.pop();
int maxFreq = 1;
while (head.size() > 0 && head.top().next < windowMax) {
leads[maxFreq++] = head.pop();
}
private void scoreWindowMultipleScorers(LeafCollector collector, Bits acceptDocs, int windowBase, int windowMin, int windowMax, int maxFreq) throws IOException {
while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) {
// a match is still possible
final BulkScorerAndDoc candidate = tail.pop();
@ -296,7 +292,7 @@ final class BooleanScorer extends BulkScorer {
}
tail.clear();
scoreWindow(collector, acceptDocs, windowBase, windowMin, windowMax, leads, maxFreq);
scoreWindowIntoBitSetAndReplay(collector, acceptDocs, windowBase, windowMin, windowMax, leads, maxFreq);
}
// Push back scorers into head and tail
@ -308,21 +304,64 @@ final class BooleanScorer extends BulkScorer {
}
}
private void scoreWindowSingleScorer(BulkScorerAndDoc bulkScorer, LeafCollector collector,
Bits acceptDocs, int windowMin, int windowMax, int max) throws IOException {
assert tail.size() == 0;
final int nextWindowBase = head.top().next & ~MASK;
final int end = Math.max(windowMax, Math.min(max, nextWindowBase));
bulkScorer.score(collector, acceptDocs, windowMin, end);
// reset the scorer that should be used for the general case
collector.setScorer(fakeScorer);
}
private BulkScorerAndDoc scoreWindow(BulkScorerAndDoc top, LeafCollector collector,
LeafCollector singleClauseCollector, Bits acceptDocs, int min, int max) throws IOException {
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
final int windowMin = Math.max(min, windowBase);
final int windowMax = Math.min(max, windowBase + SIZE);
// Fill 'leads' with all scorers from 'head' that are in the right window
leads[0] = head.pop();
int maxFreq = 1;
while (head.size() > 0 && head.top().next < windowMax) {
leads[maxFreq++] = head.pop();
}
if (minShouldMatch == 1 && maxFreq == 1) {
// special case: only one scorer can match in the current window,
// we can collect directly
final BulkScorerAndDoc bulkScorer = leads[0];
scoreWindowSingleScorer(bulkScorer, singleClauseCollector, acceptDocs, windowMin, windowMax, max);
return head.add(bulkScorer);
} else {
// general case, collect through a bit set first and then replay
scoreWindowMultipleScorers(collector, acceptDocs, windowBase, windowMin, windowMax, maxFreq);
return head.top();
}
}
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
fakeScorer.doc = -1;
collector.setScorer(fakeScorer);
final LeafCollector singleClauseCollector;
if (coordFactors[1] == 1f) {
singleClauseCollector = collector;
} else {
singleClauseCollector = new FilterLeafCollector(collector) {
@Override
public void setScorer(Scorer scorer) throws IOException {
super.setScorer(new BooleanTopLevelScorers.BoostedScorer(scorer, coordFactors[1]));
}
};
}
BulkScorerAndDoc top = advance(min);
while (top.next < max) {
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
final int windowMin = Math.max(min, windowBase);
final int windowMax = Math.min(max, windowBase + SIZE);
// general case
scoreWindow(collector, acceptDocs, windowBase, windowMin, windowMax);
top = head.top();
top = scoreWindow(top, collector, singleClauseCollector, acceptDocs, min, max);
}
return top.next;

View File

@ -22,6 +22,8 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import org.apache.lucene.util.Bits;
/** Internal document-at-a-time scorers used to deal with stupid coord() computation */
class BooleanTopLevelScorers {
@ -49,6 +51,38 @@ class BooleanTopLevelScorers {
}
}
/**
* Used when there is more than one scorer in a query, but a segment
* only had one non-null scorer.
*/
static class BoostedBulkScorer extends BulkScorer {
final BulkScorer in;
final float boost;
BoostedBulkScorer(BulkScorer scorer, float boost) {
this.in = scorer;
this.boost = boost;
}
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
final LeafCollector wrapped = new FilterLeafCollector(collector) {
@Override
public void setScorer(Scorer scorer) throws IOException {
super.setScorer(new BoostedScorer(scorer, boost));
}
};
return in.score(wrapped, acceptDocs, min, max);
}
@Override
public long cost() {
return in.cost();
}
}
/**
* Used when there are both mandatory and optional clauses, but minShouldMatch
* dictates that some of the optional clauses must match. The query is a conjunction,

View File

@ -190,7 +190,7 @@ final class BooleanWeight extends Weight {
/** Try to build a boolean scorer for this weight. Returns null if {@link BooleanScorer}
* cannot be used. */
// pkg-private for forcing use of BooleanScorer in tests
BooleanScorer booleanScorer(LeafReaderContext context) throws IOException {
BulkScorer booleanScorer(LeafReaderContext context) throws IOException {
List<BulkScorer> optional = new ArrayList<BulkScorer>();
Iterator<BooleanClause> cIter = query.iterator();
for (Weight w : weights) {
@ -222,12 +222,21 @@ final class BooleanWeight extends Weight {
return null;
}
if (optional.size() == 1) {
BulkScorer opt = optional.get(0);
if (!disableCoord && maxCoord > 1) {
return new BooleanTopLevelScorers.BoostedBulkScorer(opt, coord(1, maxCoord));
} else {
return opt;
}
}
return new BooleanScorer(this, disableCoord, maxCoord, optional, Math.max(1, query.getMinimumNumberShouldMatch()), needsScores);
}
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
final BooleanScorer bulkScorer = booleanScorer(context);
final BulkScorer bulkScorer = booleanScorer(context);
if (bulkScorer != null) { // BooleanScorer is applicable
// TODO: what is the right heuristic here?
final long costThreshold;

View File

@ -18,17 +18,24 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.Weight.DefaultBulkScorer;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestBooleanScorer extends LuceneTestCase {
private static final String FIELD = "category";
@ -141,4 +148,93 @@ public class TestBooleanScorer extends LuceneTestCase {
r.close();
dir.close();
}
public void testOptimizeTopLevelClauseOrNull() throws IOException {
// When there is a single non-null scorer, this scorer should be used
// directly
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(new StringField("foo", "bar", Store.NO));
w.addDocument(doc);
IndexReader reader = w.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setQueryCache(null); // so that weights are not wrapped
final LeafReaderContext ctx = reader.leaves().get(0);
Query query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
.build();
// no scores -> term scorer
Weight weight = searcher.createNormalizedWeight(query, false);
BulkScorer scorer = ((BooleanWeight) weight).booleanScorer(ctx);
assertTrue(scorer instanceof DefaultBulkScorer); // term scorer
// disabled coords -> term scorer
query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
.setDisableCoord(true)
.build();
weight = searcher.createNormalizedWeight(query, true);
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
assertTrue(scorer instanceof DefaultBulkScorer); // term scorer
// enabled coords -> BoostedBulkScorer
searcher.setSimilarity(new ClassicSimilarity());
query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
.build();
weight = searcher.createNormalizedWeight(query, true);
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
assertTrue(scorer instanceof BooleanTopLevelScorers.BoostedBulkScorer);
w.close();
reader.close();
dir.close();
}
public void testSparseClauseOptimization() throws IOException {
// When some windows have only one scorer that can match, the scorer will
// directly call the collector in this window
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document emptyDoc = new Document();
final int numDocs = atLeast(10);
for (int d = 0; d < numDocs; ++d) {
for (int i = random().nextInt(5000); i >= 0; --i) {
w.addDocument(emptyDoc);
}
Document doc = new Document();
for (String value : Arrays.asList("foo", "bar", "baz")) {
if (random().nextBoolean()) {
doc.add(new StringField("field", value, Store.NO));
}
}
}
for (int i = TestUtil.nextInt(random(), 3000, 5000); i >= 0; --i) {
w.addDocument(emptyDoc);
}
if (random().nextBoolean()) {
w.forceMerge(1);
}
IndexReader reader = w.getReader();
IndexSearcher searcher = newSearcher(reader);
Query query = new BooleanQuery.Builder()
.add(new BoostQuery(new TermQuery(new Term("field", "foo")), 3), Occur.SHOULD)
.add(new BoostQuery(new TermQuery(new Term("field", "bar")), 3), Occur.SHOULD)
.add(new BoostQuery(new TermQuery(new Term("field", "baz")), 3), Occur.SHOULD)
.setDisableCoord(random().nextBoolean())
.build();
// duel BS1 vs. BS2
QueryUtils.check(random(), query, searcher);
reader.close();
w.close();
dir.close();
}
}