LUCENE-6850: Optimize BooleanScorer for sparse clauses.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1710591 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2015-10-26 13:10:00 +00:00
parent b3ee21b7e9
commit 915992b088
5 changed files with 206 additions and 26 deletions

View File

@ -156,6 +156,8 @@ Optimizations
strings larger than 64kb by an amount equal to string's utf8 size. strings larger than 64kb by an amount equal to string's utf8 size.
(Dawid Weiss, Robert Muir, shalin) (Dawid Weiss, Robert Muir, shalin)
* LUCENE-6850: Optimize BooleanScorer for sparse clauses. (Adrien Grand)
Bug Fixes Bug Fixes
* LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display * LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display

View File

@ -83,11 +83,11 @@ final class BooleanScorer extends BulkScorer {
} }
void advance(int min) throws IOException { void advance(int min) throws IOException {
score(null, min, min); score(orCollector, null, min, min);
} }
void score(Bits acceptDocs, int min, int max) throws IOException { void score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
next = scorer.score(orCollector, acceptDocs, min, max); next = scorer.score(collector, acceptDocs, min, max);
} }
} }
@ -179,6 +179,9 @@ final class BooleanScorer extends BulkScorer {
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) { if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
throw new IllegalArgumentException("minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch); throw new IllegalArgumentException("minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch);
} }
if (scorers.size() <= 1) {
throw new IllegalArgumentException("This scorer can only be used with two scorers or more, got " + scorers.size());
}
for (int i = 0; i < buckets.length; i++) { for (int i = 0; i < buckets.length; i++) {
buckets[i] = new Bucket(); buckets[i] = new Bucket();
} }
@ -237,12 +240,12 @@ final class BooleanScorer extends BulkScorer {
} }
} }
private void scoreWindow(LeafCollector collector, Bits acceptDocs, int base, int min, int max, private void scoreWindowIntoBitSetAndReplay(LeafCollector collector, Bits acceptDocs,
BulkScorerAndDoc[] scorers, int numScorers) throws IOException { int base, int min, int max, BulkScorerAndDoc[] scorers, int numScorers) throws IOException {
for (int i = 0; i < numScorers; ++i) { for (int i = 0; i < numScorers; ++i) {
final BulkScorerAndDoc scorer = scorers[i]; final BulkScorerAndDoc scorer = scorers[i];
assert scorer.next < max; assert scorer.next < max;
scorer.score(acceptDocs, min, max); scorer.score(orCollector, acceptDocs, min, max);
} }
scoreMatches(collector, base); scoreMatches(collector, base);
@ -270,14 +273,7 @@ final class BooleanScorer extends BulkScorer {
return headTop; return headTop;
} }
private void scoreWindow(LeafCollector collector, Bits acceptDocs, int windowBase, int windowMin, int windowMax) throws IOException { private void scoreWindowMultipleScorers(LeafCollector collector, Bits acceptDocs, int windowBase, int windowMin, int windowMax, int maxFreq) throws IOException {
// Fill 'leads' with all scorers from 'head' that are in the right window
leads[0] = head.pop();
int maxFreq = 1;
while (head.size() > 0 && head.top().next < windowMax) {
leads[maxFreq++] = head.pop();
}
while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) { while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) {
// a match is still possible // a match is still possible
final BulkScorerAndDoc candidate = tail.pop(); final BulkScorerAndDoc candidate = tail.pop();
@ -296,7 +292,7 @@ final class BooleanScorer extends BulkScorer {
} }
tail.clear(); tail.clear();
scoreWindow(collector, acceptDocs, windowBase, windowMin, windowMax, leads, maxFreq); scoreWindowIntoBitSetAndReplay(collector, acceptDocs, windowBase, windowMin, windowMax, leads, maxFreq);
} }
// Push back scorers into head and tail // Push back scorers into head and tail
@ -308,21 +304,64 @@ final class BooleanScorer extends BulkScorer {
} }
} }
/**
 * Scores a range of documents with a single clause, passing {@code collector}
 * straight to that clause's bulk scorer.
 *
 * <p>Scoring starts at {@code windowMin}. The end of the range is the larger of
 * {@code windowMax} and {@code min(max, nextWindowBase)}, where
 * {@code nextWindowBase} is {@code head.top().next} with the {@code MASK} bits
 * cleared.
 *
 * @param bulkScorer the clause to score
 * @param collector  the collector handed to {@code bulkScorer}
 * @param acceptDocs forwarded unchanged to {@code bulkScorer}
 * @param windowMin  first document of the range (inclusive lower bound passed on)
 * @param windowMax  end of the current window
 * @param max        upper bound requested by the caller of the enclosing scorer
 */
private void scoreWindowSingleScorer(BulkScorerAndDoc bulkScorer, LeafCollector collector,
    Bits acceptDocs, int windowMin, int windowMax, int max) throws IOException {
  assert tail.size() == 0;

  // Base of the window that holds the next document of the clause on top of 'head'.
  final int nextWindowBase = head.top().next & ~MASK;
  // Never go past 'max', but always cover at least the current window.
  final int cappedNextBase = Math.min(max, nextWindowBase);
  final int end = Math.max(windowMax, cappedNextBase);

  bulkScorer.score(collector, acceptDocs, windowMin, end);

  // reset the scorer that should be used for the general case
  collector.setScorer(fakeScorer);
}
/**
 * Scores the window derived from {@code top.next} and returns the entry to
 * continue from.
 *
 * <p>Clauses are popped from {@code head} into {@code leads} while their
 * {@code next} document is below the window's upper bound. If
 * {@code minShouldMatch} is 1 and only one clause was popped, that clause is
 * scored directly via {@link #scoreWindowSingleScorer} with
 * {@code singleClauseCollector} and then re-added to {@code head}. Otherwise
 * scoring is delegated to {@code scoreWindowMultipleScorers} with
 * {@code collector}.
 *
 * @return the value returned by {@code head.add(...)} in the single-clause
 *         case, {@code head.top()} otherwise
 */
private BulkScorerAndDoc scoreWindow(BulkScorerAndDoc top, LeafCollector collector,
    LeafCollector singleClauseCollector, Bits acceptDocs, int min, int max) throws IOException {
  // find the window that the next match belongs to
  final int windowBase = top.next & ~MASK;
  final int windowMin = Math.max(min, windowBase);
  final int windowMax = Math.min(max, windowBase + SIZE);

  // Fill 'leads' with all scorers from 'head' that are in the right window;
  // the first pop is unconditional.
  int maxFreq = 0;
  do {
    leads[maxFreq++] = head.pop();
  } while (head.size() > 0 && head.top().next < windowMax);

  if (minShouldMatch != 1 || maxFreq != 1) {
    // general case, collect through a bit set first and then replay
    scoreWindowMultipleScorers(collector, acceptDocs, windowBase, windowMin, windowMax, maxFreq);
    return head.top();
  }

  // special case: only one scorer can match in the current window,
  // we can collect directly
  final BulkScorerAndDoc onlyLead = leads[0];
  scoreWindowSingleScorer(onlyLead, singleClauseCollector, acceptDocs, windowMin, windowMax, max);
  return head.add(onlyLead);
}
@Override @Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException { public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
fakeScorer.doc = -1; fakeScorer.doc = -1;
collector.setScorer(fakeScorer); collector.setScorer(fakeScorer);
final LeafCollector singleClauseCollector;
if (coordFactors[1] == 1f) {
singleClauseCollector = collector;
} else {
singleClauseCollector = new FilterLeafCollector(collector) {
@Override
public void setScorer(Scorer scorer) throws IOException {
super.setScorer(new BooleanTopLevelScorers.BoostedScorer(scorer, coordFactors[1]));
}
};
}
BulkScorerAndDoc top = advance(min); BulkScorerAndDoc top = advance(min);
while (top.next < max) { while (top.next < max) {
top = scoreWindow(top, collector, singleClauseCollector, acceptDocs, min, max);
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
final int windowMin = Math.max(min, windowBase);
final int windowMax = Math.min(max, windowBase + SIZE);
// general case
scoreWindow(collector, acceptDocs, windowBase, windowMin, windowMax);
top = head.top();
} }
return top.next; return top.next;

View File

@ -22,6 +22,8 @@ import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import org.apache.lucene.util.Bits;
/** Internal document-at-a-time scorers used to deal with stupid coord() computation */ /** Internal document-at-a-time scorers used to deal with stupid coord() computation */
class BooleanTopLevelScorers { class BooleanTopLevelScorers {
@ -48,7 +50,39 @@ class BooleanTopLevelScorers {
return Collections.singleton(new ChildScorer(in, "BOOSTED")); return Collections.singleton(new ChildScorer(in, "BOOSTED"));
} }
} }
/**
 * Used when there is more than one scorer in a query, but a segment
 * only had one non-null scorer.
 *
 * <p>Delegates scoring to the wrapped {@link BulkScorer}; every {@link Scorer}
 * that the delegate sets on the collector is first wrapped in a
 * {@link BoostedScorer} built with {@link #boost}.
 */
static class BoostedBulkScorer extends BulkScorer {

  /** The bulk scorer that does the actual work. */
  final BulkScorer in;
  /** The boost handed to each {@link BoostedScorer}. */
  final float boost;

  BoostedBulkScorer(BulkScorer scorer, float boost) {
    this.in = scorer;
    this.boost = boost;
  }

  @Override
  public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
    // A fresh wrapper is created on every call, around the collector of that call.
    return in.score(boosting(collector), acceptDocs, min, max);
  }

  @Override
  public long cost() {
    return in.cost();
  }

  /** Returns a collector that wraps every scorer it is given in a {@link BoostedScorer}. */
  private LeafCollector boosting(LeafCollector delegate) {
    return new FilterLeafCollector(delegate) {
      @Override
      public void setScorer(Scorer scorer) throws IOException {
        super.setScorer(new BoostedScorer(scorer, boost));
      }
    };
  }
}
/** /**
* Used when there are both mandatory and optional clauses, but minShouldMatch * Used when there are both mandatory and optional clauses, but minShouldMatch
* dictates that some of the optional clauses must match. The query is a conjunction, * dictates that some of the optional clauses must match. The query is a conjunction,

View File

@ -190,7 +190,7 @@ final class BooleanWeight extends Weight {
/** Try to build a boolean scorer for this weight. Returns null if {@link BooleanScorer} /** Try to build a boolean scorer for this weight. Returns null if {@link BooleanScorer}
* cannot be used. */ * cannot be used. */
// pkg-private for forcing use of BooleanScorer in tests // pkg-private for forcing use of BooleanScorer in tests
BooleanScorer booleanScorer(LeafReaderContext context) throws IOException { BulkScorer booleanScorer(LeafReaderContext context) throws IOException {
List<BulkScorer> optional = new ArrayList<BulkScorer>(); List<BulkScorer> optional = new ArrayList<BulkScorer>();
Iterator<BooleanClause> cIter = query.iterator(); Iterator<BooleanClause> cIter = query.iterator();
for (Weight w : weights) { for (Weight w : weights) {
@ -222,12 +222,21 @@ final class BooleanWeight extends Weight {
return null; return null;
} }
if (optional.size() == 1) {
BulkScorer opt = optional.get(0);
if (!disableCoord && maxCoord > 1) {
return new BooleanTopLevelScorers.BoostedBulkScorer(opt, coord(1, maxCoord));
} else {
return opt;
}
}
return new BooleanScorer(this, disableCoord, maxCoord, optional, Math.max(1, query.getMinimumNumberShouldMatch()), needsScores); return new BooleanScorer(this, disableCoord, maxCoord, optional, Math.max(1, query.getMinimumNumberShouldMatch()), needsScores);
} }
@Override @Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException { public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
final BooleanScorer bulkScorer = booleanScorer(context); final BulkScorer bulkScorer = booleanScorer(context);
if (bulkScorer != null) { // BooleanScorer is applicable if (bulkScorer != null) { // BooleanScorer is applicable
// TODO: what is the right heuristic here? // TODO: what is the right heuristic here?
final long costThreshold; final long costThreshold;

View File

@ -18,17 +18,24 @@ package org.apache.lucene.search;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.Set; import java.util.Set;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.Weight.DefaultBulkScorer;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestBooleanScorer extends LuceneTestCase { public class TestBooleanScorer extends LuceneTestCase {
private static final String FIELD = "category"; private static final String FIELD = "category";
@ -141,4 +148,93 @@ public class TestBooleanScorer extends LuceneTestCase {
r.close(); r.close();
dir.close(); dir.close();
} }
/**
 * Checks the type of bulk scorer returned by {@code BooleanWeight.booleanScorer}
 * for a two-clause SHOULD query where one term exists in the index and the
 * other does not, under three configurations: scores not needed, coords
 * disabled, and coords enabled with {@link ClassicSimilarity}.
 */
public void testOptimizeTopLevelClauseOrNull() throws IOException {
  // When there is a single non-null scorer, this scorer should be used
  // directly
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(new StringField("foo", "bar", Store.NO));
  w.addDocument(doc);
  IndexReader reader = w.getReader();
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setQueryCache(null); // so that weights are not wrapped
  final LeafReaderContext ctx = reader.leaves().get(0);

  final Term existingTerm = new Term("foo", "bar");
  final Term missingTerm = new Term("foo", "baz");

  // no scores -> term scorer
  Query noScoresQuery = new BooleanQuery.Builder()
      .add(new TermQuery(existingTerm), Occur.SHOULD)
      .add(new TermQuery(missingTerm), Occur.SHOULD)
      .build();
  BooleanWeight noScoresWeight = (BooleanWeight) searcher.createNormalizedWeight(noScoresQuery, false);
  BulkScorer noScoresScorer = noScoresWeight.booleanScorer(ctx);
  assertTrue(noScoresScorer instanceof DefaultBulkScorer); // term scorer

  // disabled coords -> term scorer
  Query noCoordQuery = new BooleanQuery.Builder()
      .add(new TermQuery(existingTerm), Occur.SHOULD)
      .add(new TermQuery(missingTerm), Occur.SHOULD)
      .setDisableCoord(true)
      .build();
  BooleanWeight noCoordWeight = (BooleanWeight) searcher.createNormalizedWeight(noCoordQuery, true);
  BulkScorer noCoordScorer = noCoordWeight.booleanScorer(ctx);
  assertTrue(noCoordScorer instanceof DefaultBulkScorer); // term scorer

  // enabled coords -> BoostedBulkScorer
  searcher.setSimilarity(new ClassicSimilarity());
  Query coordQuery = new BooleanQuery.Builder()
      .add(new TermQuery(existingTerm), Occur.SHOULD)
      .add(new TermQuery(missingTerm), Occur.SHOULD)
      .build();
  BooleanWeight coordWeight = (BooleanWeight) searcher.createNormalizedWeight(coordQuery, true);
  BulkScorer coordScorer = coordWeight.booleanScorer(ctx);
  assertTrue(coordScorer instanceof BooleanTopLevelScorers.BoostedBulkScorer);

  w.close();
  reader.close();
  dir.close();
}
/**
 * Builds an index where documents carrying the terms "foo", "bar" and/or
 * "baz" are separated by runs of empty documents, then runs a three-clause
 * SHOULD query over it through {@code QueryUtils.check}.
 */
public void testSparseClauseOptimization() throws IOException {
  // When some windows have only one scorer that can match, the scorer will
  // directly call the collector in this window
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document emptyDoc = new Document();
  final int numDocs = atLeast(10);
  for (int d = 0; d < numDocs; ++d) {
    // a run of empty documents between two documents that may carry terms
    for (int i = random().nextInt(5000); i >= 0; --i) {
      w.addDocument(emptyDoc);
    }
    Document doc = new Document();
    for (String value : Arrays.asList("foo", "bar", "baz")) {
      if (random().nextBoolean()) {
        doc.add(new StringField("field", value, Store.NO));
      }
    }
    // Index the document: without this call no document contains any of the
    // queried terms and the query below cannot match anything.
    w.addDocument(doc);
  }
  // trailing run of empty documents
  for (int i = TestUtil.nextInt(random(), 3000, 5000); i >= 0; --i) {
    w.addDocument(emptyDoc);
  }
  if (random().nextBoolean()) {
    w.forceMerge(1);
  }
  IndexReader reader = w.getReader();
  IndexSearcher searcher = newSearcher(reader);

  Query query = new BooleanQuery.Builder()
      .add(new BoostQuery(new TermQuery(new Term("field", "foo")), 3), Occur.SHOULD)
      .add(new BoostQuery(new TermQuery(new Term("field", "bar")), 3), Occur.SHOULD)
      .add(new BoostQuery(new TermQuery(new Term("field", "baz")), 3), Occur.SHOULD)
      .setDisableCoord(random().nextBoolean())
      .build();

  // duel BS1 vs. BS2
  QueryUtils.check(random(), query, searcher);

  reader.close();
  w.close();
  dir.close();
}
} }