mirror of https://github.com/apache/lucene.git
LUCENE-6850: Optimize BooleanScorer for sparse clauses.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1710591 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b3ee21b7e9
commit
915992b088
|
@ -156,6 +156,8 @@ Optimizations
|
||||||
strings larger than 64kb by an amount equal to string's utf8 size.
|
strings larger than 64kb by an amount equal to string's utf8 size.
|
||||||
(Dawid Weiss, Robert Muir, shalin)
|
(Dawid Weiss, Robert Muir, shalin)
|
||||||
|
|
||||||
|
* LUCENE-6850: Optimize BooleanScorer for sparse clauses. (Adrien Grand)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
||||||
* LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display
|
* LUCENE-6817: ComplexPhraseQueryParser.ComplexPhraseQuery does not display
|
||||||
|
|
|
@ -83,11 +83,11 @@ final class BooleanScorer extends BulkScorer {
|
||||||
}
|
}
|
||||||
|
|
||||||
void advance(int min) throws IOException {
|
void advance(int min) throws IOException {
|
||||||
score(null, min, min);
|
score(orCollector, null, min, min);
|
||||||
}
|
}
|
||||||
|
|
||||||
void score(Bits acceptDocs, int min, int max) throws IOException {
|
void score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||||
next = scorer.score(orCollector, acceptDocs, min, max);
|
next = scorer.score(collector, acceptDocs, min, max);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -179,6 +179,9 @@ final class BooleanScorer extends BulkScorer {
|
||||||
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
|
if (minShouldMatch < 1 || minShouldMatch > scorers.size()) {
|
||||||
throw new IllegalArgumentException("minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch);
|
throw new IllegalArgumentException("minShouldMatch should be within 1..num_scorers. Got " + minShouldMatch);
|
||||||
}
|
}
|
||||||
|
if (scorers.size() <= 1) {
|
||||||
|
throw new IllegalArgumentException("This scorer can only be used with two scorers or more, got " + scorers.size());
|
||||||
|
}
|
||||||
for (int i = 0; i < buckets.length; i++) {
|
for (int i = 0; i < buckets.length; i++) {
|
||||||
buckets[i] = new Bucket();
|
buckets[i] = new Bucket();
|
||||||
}
|
}
|
||||||
|
@ -237,12 +240,12 @@ final class BooleanScorer extends BulkScorer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void scoreWindow(LeafCollector collector, Bits acceptDocs, int base, int min, int max,
|
private void scoreWindowIntoBitSetAndReplay(LeafCollector collector, Bits acceptDocs,
|
||||||
BulkScorerAndDoc[] scorers, int numScorers) throws IOException {
|
int base, int min, int max, BulkScorerAndDoc[] scorers, int numScorers) throws IOException {
|
||||||
for (int i = 0; i < numScorers; ++i) {
|
for (int i = 0; i < numScorers; ++i) {
|
||||||
final BulkScorerAndDoc scorer = scorers[i];
|
final BulkScorerAndDoc scorer = scorers[i];
|
||||||
assert scorer.next < max;
|
assert scorer.next < max;
|
||||||
scorer.score(acceptDocs, min, max);
|
scorer.score(orCollector, acceptDocs, min, max);
|
||||||
}
|
}
|
||||||
|
|
||||||
scoreMatches(collector, base);
|
scoreMatches(collector, base);
|
||||||
|
@ -270,14 +273,7 @@ final class BooleanScorer extends BulkScorer {
|
||||||
return headTop;
|
return headTop;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void scoreWindow(LeafCollector collector, Bits acceptDocs, int windowBase, int windowMin, int windowMax) throws IOException {
|
private void scoreWindowMultipleScorers(LeafCollector collector, Bits acceptDocs, int windowBase, int windowMin, int windowMax, int maxFreq) throws IOException {
|
||||||
// Fill 'leads' with all scorers from 'head' that are in the right window
|
|
||||||
leads[0] = head.pop();
|
|
||||||
int maxFreq = 1;
|
|
||||||
while (head.size() > 0 && head.top().next < windowMax) {
|
|
||||||
leads[maxFreq++] = head.pop();
|
|
||||||
}
|
|
||||||
|
|
||||||
while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) {
|
while (maxFreq < minShouldMatch && maxFreq + tail.size() >= minShouldMatch) {
|
||||||
// a match is still possible
|
// a match is still possible
|
||||||
final BulkScorerAndDoc candidate = tail.pop();
|
final BulkScorerAndDoc candidate = tail.pop();
|
||||||
|
@ -296,7 +292,7 @@ final class BooleanScorer extends BulkScorer {
|
||||||
}
|
}
|
||||||
tail.clear();
|
tail.clear();
|
||||||
|
|
||||||
scoreWindow(collector, acceptDocs, windowBase, windowMin, windowMax, leads, maxFreq);
|
scoreWindowIntoBitSetAndReplay(collector, acceptDocs, windowBase, windowMin, windowMax, leads, maxFreq);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Push back scorers into head and tail
|
// Push back scorers into head and tail
|
||||||
|
@ -308,21 +304,64 @@ final class BooleanScorer extends BulkScorer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void scoreWindowSingleScorer(BulkScorerAndDoc bulkScorer, LeafCollector collector,
|
||||||
|
Bits acceptDocs, int windowMin, int windowMax, int max) throws IOException {
|
||||||
|
assert tail.size() == 0;
|
||||||
|
final int nextWindowBase = head.top().next & ~MASK;
|
||||||
|
final int end = Math.max(windowMax, Math.min(max, nextWindowBase));
|
||||||
|
|
||||||
|
bulkScorer.score(collector, acceptDocs, windowMin, end);
|
||||||
|
|
||||||
|
// reset the scorer that should be used for the general case
|
||||||
|
collector.setScorer(fakeScorer);
|
||||||
|
}
|
||||||
|
|
||||||
|
private BulkScorerAndDoc scoreWindow(BulkScorerAndDoc top, LeafCollector collector,
|
||||||
|
LeafCollector singleClauseCollector, Bits acceptDocs, int min, int max) throws IOException {
|
||||||
|
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
|
||||||
|
final int windowMin = Math.max(min, windowBase);
|
||||||
|
final int windowMax = Math.min(max, windowBase + SIZE);
|
||||||
|
|
||||||
|
// Fill 'leads' with all scorers from 'head' that are in the right window
|
||||||
|
leads[0] = head.pop();
|
||||||
|
int maxFreq = 1;
|
||||||
|
while (head.size() > 0 && head.top().next < windowMax) {
|
||||||
|
leads[maxFreq++] = head.pop();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (minShouldMatch == 1 && maxFreq == 1) {
|
||||||
|
// special case: only one scorer can match in the current window,
|
||||||
|
// we can collect directly
|
||||||
|
final BulkScorerAndDoc bulkScorer = leads[0];
|
||||||
|
scoreWindowSingleScorer(bulkScorer, singleClauseCollector, acceptDocs, windowMin, windowMax, max);
|
||||||
|
return head.add(bulkScorer);
|
||||||
|
} else {
|
||||||
|
// general case, collect through a bit set first and then replay
|
||||||
|
scoreWindowMultipleScorers(collector, acceptDocs, windowBase, windowMin, windowMax, maxFreq);
|
||||||
|
return head.top();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||||
fakeScorer.doc = -1;
|
fakeScorer.doc = -1;
|
||||||
collector.setScorer(fakeScorer);
|
collector.setScorer(fakeScorer);
|
||||||
|
|
||||||
|
final LeafCollector singleClauseCollector;
|
||||||
|
if (coordFactors[1] == 1f) {
|
||||||
|
singleClauseCollector = collector;
|
||||||
|
} else {
|
||||||
|
singleClauseCollector = new FilterLeafCollector(collector) {
|
||||||
|
@Override
|
||||||
|
public void setScorer(Scorer scorer) throws IOException {
|
||||||
|
super.setScorer(new BooleanTopLevelScorers.BoostedScorer(scorer, coordFactors[1]));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
BulkScorerAndDoc top = advance(min);
|
BulkScorerAndDoc top = advance(min);
|
||||||
while (top.next < max) {
|
while (top.next < max) {
|
||||||
|
top = scoreWindow(top, collector, singleClauseCollector, acceptDocs, min, max);
|
||||||
final int windowBase = top.next & ~MASK; // find the window that the next match belongs to
|
|
||||||
final int windowMin = Math.max(min, windowBase);
|
|
||||||
final int windowMax = Math.min(max, windowBase + SIZE);
|
|
||||||
|
|
||||||
// general case
|
|
||||||
scoreWindow(collector, acceptDocs, windowBase, windowMin, windowMax);
|
|
||||||
top = head.top();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return top.next;
|
return top.next;
|
||||||
|
|
|
@ -22,6 +22,8 @@ import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.Bits;
|
||||||
|
|
||||||
/** Internal document-at-a-time scorers used to deal with stupid coord() computation */
|
/** Internal document-at-a-time scorers used to deal with stupid coord() computation */
|
||||||
class BooleanTopLevelScorers {
|
class BooleanTopLevelScorers {
|
||||||
|
|
||||||
|
@ -49,6 +51,38 @@ class BooleanTopLevelScorers {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used when there is more than one scorer in a query, but a segment
|
||||||
|
* only had one non-null scorer.
|
||||||
|
*/
|
||||||
|
static class BoostedBulkScorer extends BulkScorer {
|
||||||
|
|
||||||
|
final BulkScorer in;
|
||||||
|
final float boost;
|
||||||
|
|
||||||
|
BoostedBulkScorer(BulkScorer scorer, float boost) {
|
||||||
|
this.in = scorer;
|
||||||
|
this.boost = boost;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
|
||||||
|
final LeafCollector wrapped = new FilterLeafCollector(collector) {
|
||||||
|
@Override
|
||||||
|
public void setScorer(Scorer scorer) throws IOException {
|
||||||
|
super.setScorer(new BoostedScorer(scorer, boost));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
return in.score(wrapped, acceptDocs, min, max);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return in.cost();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Used when there are both mandatory and optional clauses, but minShouldMatch
|
* Used when there are both mandatory and optional clauses, but minShouldMatch
|
||||||
* dictates that some of the optional clauses must match. The query is a conjunction,
|
* dictates that some of the optional clauses must match. The query is a conjunction,
|
||||||
|
|
|
@ -190,7 +190,7 @@ final class BooleanWeight extends Weight {
|
||||||
/** Try to build a boolean scorer for this weight. Returns null if {@link BooleanScorer}
|
/** Try to build a boolean scorer for this weight. Returns null if {@link BooleanScorer}
|
||||||
* cannot be used. */
|
* cannot be used. */
|
||||||
// pkg-private for forcing use of BooleanScorer in tests
|
// pkg-private for forcing use of BooleanScorer in tests
|
||||||
BooleanScorer booleanScorer(LeafReaderContext context) throws IOException {
|
BulkScorer booleanScorer(LeafReaderContext context) throws IOException {
|
||||||
List<BulkScorer> optional = new ArrayList<BulkScorer>();
|
List<BulkScorer> optional = new ArrayList<BulkScorer>();
|
||||||
Iterator<BooleanClause> cIter = query.iterator();
|
Iterator<BooleanClause> cIter = query.iterator();
|
||||||
for (Weight w : weights) {
|
for (Weight w : weights) {
|
||||||
|
@ -222,12 +222,21 @@ final class BooleanWeight extends Weight {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (optional.size() == 1) {
|
||||||
|
BulkScorer opt = optional.get(0);
|
||||||
|
if (!disableCoord && maxCoord > 1) {
|
||||||
|
return new BooleanTopLevelScorers.BoostedBulkScorer(opt, coord(1, maxCoord));
|
||||||
|
} else {
|
||||||
|
return opt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return new BooleanScorer(this, disableCoord, maxCoord, optional, Math.max(1, query.getMinimumNumberShouldMatch()), needsScores);
|
return new BooleanScorer(this, disableCoord, maxCoord, optional, Math.max(1, query.getMinimumNumberShouldMatch()), needsScores);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
|
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
|
||||||
final BooleanScorer bulkScorer = booleanScorer(context);
|
final BulkScorer bulkScorer = booleanScorer(context);
|
||||||
if (bulkScorer != null) { // BooleanScorer is applicable
|
if (bulkScorer != null) { // BooleanScorer is applicable
|
||||||
// TODO: what is the right heuristic here?
|
// TODO: what is the right heuristic here?
|
||||||
final long costThreshold;
|
final long costThreshold;
|
||||||
|
|
|
@ -18,17 +18,24 @@ package org.apache.lucene.search;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.StringField;
|
||||||
|
import org.apache.lucene.document.Field.Store;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.RandomIndexWriter;
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.BooleanClause.Occur;
|
||||||
|
import org.apache.lucene.search.Weight.DefaultBulkScorer;
|
||||||
|
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
public class TestBooleanScorer extends LuceneTestCase {
|
public class TestBooleanScorer extends LuceneTestCase {
|
||||||
private static final String FIELD = "category";
|
private static final String FIELD = "category";
|
||||||
|
@ -141,4 +148,93 @@ public class TestBooleanScorer extends LuceneTestCase {
|
||||||
r.close();
|
r.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testOptimizeTopLevelClauseOrNull() throws IOException {
|
||||||
|
// When there is a single non-null scorer, this scorer should be used
|
||||||
|
// directly
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new StringField("foo", "bar", Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
IndexReader reader = w.getReader();
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
searcher.setQueryCache(null); // so that weights are not wrapped
|
||||||
|
final LeafReaderContext ctx = reader.leaves().get(0);
|
||||||
|
Query query = new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
|
||||||
|
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
|
||||||
|
.build();
|
||||||
|
|
||||||
|
// no scores -> term scorer
|
||||||
|
Weight weight = searcher.createNormalizedWeight(query, false);
|
||||||
|
BulkScorer scorer = ((BooleanWeight) weight).booleanScorer(ctx);
|
||||||
|
assertTrue(scorer instanceof DefaultBulkScorer); // term scorer
|
||||||
|
|
||||||
|
// disabled coords -> term scorer
|
||||||
|
query = new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
|
||||||
|
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
|
||||||
|
.setDisableCoord(true)
|
||||||
|
.build();
|
||||||
|
weight = searcher.createNormalizedWeight(query, true);
|
||||||
|
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
|
||||||
|
assertTrue(scorer instanceof DefaultBulkScorer); // term scorer
|
||||||
|
|
||||||
|
// enabled coords -> BoostedBulkScorer
|
||||||
|
searcher.setSimilarity(new ClassicSimilarity());
|
||||||
|
query = new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("foo", "bar")), Occur.SHOULD) // existing term
|
||||||
|
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD) // missing term
|
||||||
|
.build();
|
||||||
|
weight = searcher.createNormalizedWeight(query, true);
|
||||||
|
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
|
||||||
|
assertTrue(scorer instanceof BooleanTopLevelScorers.BoostedBulkScorer);
|
||||||
|
|
||||||
|
w.close();
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSparseClauseOptimization() throws IOException {
|
||||||
|
// When some windows have only one scorer that can match, the scorer will
|
||||||
|
// directly call the collector in this window
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document emptyDoc = new Document();
|
||||||
|
final int numDocs = atLeast(10);
|
||||||
|
for (int d = 0; d < numDocs; ++d) {
|
||||||
|
for (int i = random().nextInt(5000); i >= 0; --i) {
|
||||||
|
w.addDocument(emptyDoc);
|
||||||
|
}
|
||||||
|
Document doc = new Document();
|
||||||
|
for (String value : Arrays.asList("foo", "bar", "baz")) {
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
doc.add(new StringField("field", value, Store.NO));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = TestUtil.nextInt(random(), 3000, 5000); i >= 0; --i) {
|
||||||
|
w.addDocument(emptyDoc);
|
||||||
|
}
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
w.forceMerge(1);
|
||||||
|
}
|
||||||
|
IndexReader reader = w.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
|
||||||
|
Query query = new BooleanQuery.Builder()
|
||||||
|
.add(new BoostQuery(new TermQuery(new Term("field", "foo")), 3), Occur.SHOULD)
|
||||||
|
.add(new BoostQuery(new TermQuery(new Term("field", "bar")), 3), Occur.SHOULD)
|
||||||
|
.add(new BoostQuery(new TermQuery(new Term("field", "baz")), 3), Occur.SHOULD)
|
||||||
|
.setDisableCoord(random().nextBoolean())
|
||||||
|
.build();
|
||||||
|
|
||||||
|
// duel BS1 vs. BS2
|
||||||
|
QueryUtils.check(random(), query, searcher);
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
w.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue