LUCENE-6940: Speed up MUST_NOT clauses.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1722443 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2015-12-31 13:41:19 +00:00
parent fd765b6252
commit e773508648
7 changed files with 368 additions and 53 deletions

View File

@ -156,6 +156,9 @@ Optimizations
* LUCENE-6944: BooleanWeight no longer creates sub-scorers if BS1 is not
applicable. (Adrien Grand)
* LUCENE-6940: MUST_NOT clauses execute faster, especially when they are sparse.
(Adrien Grand)
Bug Fixes
* LUCENE-6918: LRUQueryCache.onDocIdSetEviction is only called when at least

View File

@ -37,35 +37,6 @@ final class BooleanScorer extends BulkScorer {
static final int SET_SIZE = 1 << (SHIFT - 6);
static final int SET_MASK = SET_SIZE - 1;
private static BulkScorer disableScoring(final BulkScorer scorer) {
return new BulkScorer() {
@Override
public int score(final LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
final LeafCollector noScoreCollector = new LeafCollector() {
FakeScorer fake = new FakeScorer();
@Override
public void setScorer(Scorer scorer) throws IOException {
collector.setScorer(fake);
}
@Override
public void collect(int doc) throws IOException {
fake.doc = doc;
collector.collect(doc);
}
};
return scorer.score(noScoreCollector, acceptDocs, min, max);
}
@Override
public long cost() {
return scorer.cost();
}
};
}
static class Bucket {
double score;
int freq;
@ -193,7 +164,7 @@ final class BooleanScorer extends BulkScorer {
if (needsScores == false) {
// OrCollector calls score() all the time so we have to explicitly
// disable scoring in order to avoid decoding useless norms
scorer = disableScoring(scorer);
scorer = BooleanWeight.disableScoring(scorer);
}
final BulkScorerAndDoc evicted = tail.insertWithOverflow(new BulkScorerAndDoc(scorer));
if (evicted != null) {

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.Bits;
/**
* Expert: the Weight for BooleanQuery, used to
@ -187,27 +188,45 @@ final class BooleanWeight extends Weight {
}
}
/** Try to build a boolean scorer for this weight. Returns null if {@link BooleanScorer}
* cannot be used. */
// pkg-private for forcing use of BooleanScorer in tests
BulkScorer booleanScorer(LeafReaderContext context) throws IOException {
if (query.getClauses(Occur.MUST).isEmpty() == false
|| query.getClauses(Occur.FILTER).isEmpty() == false) {
// TODO: there are some cases where BooleanScorer
// would handle conjunctions faster than
// BooleanScorer2...
return null;
} else if (query.getClauses(Occur.MUST_NOT).isEmpty() == false) {
// TODO: there are some cases where BooleanScorer could do this faster
return null;
static BulkScorer disableScoring(final BulkScorer scorer) {
return new BulkScorer() {
@Override
public int score(final LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
final LeafCollector noScoreCollector = new LeafCollector() {
FakeScorer fake = new FakeScorer();
@Override
public void setScorer(Scorer scorer) throws IOException {
collector.setScorer(fake);
}
@Override
public void collect(int doc) throws IOException {
fake.doc = doc;
collector.collect(doc);
}
};
return scorer.score(noScoreCollector, acceptDocs, min, max);
}
@Override
public long cost() {
return scorer.cost();
}
};
}
// Return a BulkScorer for the optional clauses only,
// or null if it is not applicable
// pkg-private for forcing use of BooleanScorer in tests
BulkScorer optionalBulkScorer(LeafReaderContext context) throws IOException {
List<BulkScorer> optional = new ArrayList<BulkScorer>();
Iterator<BooleanClause> cIter = query.iterator();
for (Weight w : weights) {
BooleanClause c = cIter.next();
if (c.getOccur() != Occur.SHOULD) {
throw new AssertionError();
continue;
}
BulkScorer subScorer = w.bulkScorer(context);
@ -236,10 +255,50 @@ final class BooleanWeight extends Weight {
return new BooleanScorer(this, disableCoord, maxCoord, optional, Math.max(1, query.getMinimumNumberShouldMatch()), needsScores);
}
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
final BulkScorer bulkScorer = booleanScorer(context);
if (bulkScorer != null) { // BooleanScorer is applicable
// Return a BulkScorer for the required clauses only,
// or null if it is not applicable
private BulkScorer requiredBulkScorer(LeafReaderContext context) throws IOException {
BulkScorer scorer = null;
Iterator<BooleanClause> cIter = query.iterator();
for (Weight w : weights) {
BooleanClause c = cIter.next();
if (c.isRequired() == false) {
continue;
}
if (scorer != null) {
// we don't have a BulkScorer for conjunctions
return null;
}
scorer = w.bulkScorer(context);
if (scorer == null) {
// no matches
return null;
}
if (c.isScoring() == false) {
if (needsScores) {
scorer = disableScoring(scorer);
}
} else {
assert maxCoord == 1;
}
}
return scorer;
}
/** Try to build a boolean scorer for this weight. Returns null if {@link BooleanScorer}
* cannot be used. */
BulkScorer booleanScorer(LeafReaderContext context) throws IOException {
final int numOptionalClauses = query.getClauses(Occur.SHOULD).size();
final int numRequiredClauses = query.getClauses(Occur.MUST).size() + query.getClauses(Occur.FILTER).size();
BulkScorer positiveScorer;
if (numRequiredClauses == 0) {
positiveScorer = optionalBulkScorer(context);
if (positiveScorer == null) {
return null;
}
// TODO: what is the right heuristic here?
final long costThreshold;
if (query.getMinimumNumberShouldMatch() <= 1) {
@ -257,12 +316,60 @@ final class BooleanWeight extends Weight {
costThreshold = context.reader().maxDoc() / 3;
}
if (bulkScorer.cost() > costThreshold) {
if (positiveScorer.cost() < costThreshold) {
return null;
}
} else if (numRequiredClauses == 1
&& numOptionalClauses == 0
&& query.getMinimumNumberShouldMatch() == 0) {
positiveScorer = requiredBulkScorer(context);
} else {
// TODO: there are some cases where BooleanScorer
// would handle conjunctions faster than
// BooleanScorer2...
return null;
}
if (positiveScorer == null) {
return null;
}
List<Scorer> prohibited = new ArrayList<>();
Iterator<BooleanClause> cIter = query.iterator();
for (Weight w : weights) {
BooleanClause c = cIter.next();
if (c.isProhibited()) {
Scorer scorer = w.scorer(context);
if (scorer != null) {
prohibited.add(scorer);
}
}
}
if (prohibited.isEmpty()) {
return positiveScorer;
} else {
Scorer prohibitedScorer = opt(prohibited, 1, true);
if (prohibitedScorer.twoPhaseIterator() != null) {
// ReqExclBulkScorer can't deal efficiently with two-phased prohibited clauses
return null;
}
return new ReqExclBulkScorer(positiveScorer, prohibitedScorer.iterator());
}
}
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
final BulkScorer bulkScorer = booleanScorer(context);
if (bulkScorer != null) {
// bulk scoring is applicable, use it
return bulkScorer;
}
}
} else {
// use a Scorer-based impl (BS2)
return super.bulkScorer(context);
}
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {

View File

@ -0,0 +1,64 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.util.Bits;
final class ReqExclBulkScorer extends BulkScorer {
private final BulkScorer req;
private final DocIdSetIterator excl;
ReqExclBulkScorer(BulkScorer req, DocIdSetIterator excl) {
this.req = req;
this.excl = excl;
}
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
int upTo = min;
int exclDoc = excl.docID();
while (upTo < max) {
if (exclDoc < upTo) {
exclDoc = excl.advance(upTo);
}
if (exclDoc == upTo) {
// upTo is excluded so we can consider that we scored up to upTo+1
upTo += 1;
exclDoc = excl.nextDoc();
} else {
upTo = req.score(collector, acceptDocs, upTo, Math.min(exclDoc, max));
}
}
if (upTo == max) {
upTo = req.score(collector, acceptDocs, upTo, upTo);
}
return upTo;
}
@Override
public long cost() {
return req.cost();
}
}

View File

@ -196,6 +196,60 @@ public class TestBooleanScorer extends LuceneTestCase {
dir.close();
}
public void testOptimizeProhibitedClauses() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(new StringField("foo", "bar", Store.NO));
doc.add(new StringField("foo", "baz", Store.NO));
w.addDocument(doc);
doc = new Document();
doc.add(new StringField("foo", "baz", Store.NO));
w.addDocument(doc);
w.forceMerge(1);
IndexReader reader = w.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setQueryCache(null); // so that weights are not wrapped
final LeafReaderContext ctx = reader.leaves().get(0);
Query query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD)
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST_NOT)
.build();
Weight weight = searcher.createNormalizedWeight(query, true);
BulkScorer scorer = ((BooleanWeight) weight).booleanScorer(ctx);
assertTrue(scorer instanceof ReqExclBulkScorer);
query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD)
.add(new MatchAllDocsQuery(), Occur.SHOULD)
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST_NOT)
.build();
weight = searcher.createNormalizedWeight(query, true);
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
assertTrue(scorer instanceof ReqExclBulkScorer);
query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST_NOT)
.build();
weight = searcher.createNormalizedWeight(query, true);
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
assertTrue(scorer instanceof ReqExclBulkScorer);
query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "baz")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST_NOT)
.build();
weight = searcher.createNormalizedWeight(query, true);
scorer = ((BooleanWeight) weight).booleanScorer(ctx);
assertTrue(scorer instanceof ReqExclBulkScorer);
w.close();
reader.close();
dir.close();
}
public void testSparseClauseOptimization() throws IOException {
// When some windows have only one scorer that can match, the scorer will
// directly call the collector in this window

View File

@ -132,7 +132,7 @@ public class TestMinShouldMatch2 extends LuceneTestCase {
case SCORER:
return weight.scorer(reader.getContext());
case BULK_SCORER:
final BulkScorer bulkScorer = weight.booleanScorer(reader.getContext());
final BulkScorer bulkScorer = weight.optionalBulkScorer(reader.getContext());
if (bulkScorer == null) {
if (weight.scorer(reader.getContext()) != null) {
throw new AssertionError("BooleanScorer should be applicable for this query");

View File

@ -0,0 +1,116 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestReqExclBulkScorer extends LuceneTestCase {
public void testRandom() throws IOException {
final int iters = atLeast(10);
for (int iter = 0; iter < iters; ++iter) {
doTestRandom();
}
}
public void doTestRandom() throws IOException {
final int maxDoc = TestUtil.nextInt(random(), 1, 1000);
DocIdSetBuilder reqBuilder = new DocIdSetBuilder(maxDoc);
DocIdSetBuilder exclBuilder = new DocIdSetBuilder(maxDoc);
final int numIncludedDocs = TestUtil.nextInt(random(), 1, maxDoc);
final int numExcludedDocs = TestUtil.nextInt(random(), 1, maxDoc);
for (int i = 0; i < numIncludedDocs; ++i) {
reqBuilder.add(random().nextInt(maxDoc));
}
for (int i = 0; i < numExcludedDocs; ++i) {
exclBuilder.add(random().nextInt(maxDoc));
}
final DocIdSet req = reqBuilder.build();
final DocIdSet excl = exclBuilder.build();
final BulkScorer reqBulkScorer = new BulkScorer() {
final DocIdSetIterator iterator = req.iterator();
@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max) throws IOException {
int doc = iterator.docID();
if (iterator.docID() < min) {
doc = iterator.advance(min);
}
while (doc < max) {
if (acceptDocs == null || acceptDocs.get(doc)) {
collector.collect(doc);
}
doc = iterator.nextDoc();
}
return doc;
}
@Override
public long cost() {
return iterator.cost();
}
};
ReqExclBulkScorer reqExcl = new ReqExclBulkScorer(reqBulkScorer, excl.iterator());
final FixedBitSet actualMatches = new FixedBitSet(maxDoc);
if (random().nextBoolean()) {
reqExcl.score(new LeafCollector() {
@Override
public void setScorer(Scorer scorer) throws IOException {}
@Override
public void collect(int doc) throws IOException {
actualMatches.set(doc);
}
}, null);
} else {
int next = 0;
while (next < maxDoc) {
final int min = next;
final int max = min + random().nextInt(10);
next = reqExcl.score(new LeafCollector() {
@Override
public void setScorer(Scorer scorer) throws IOException {}
@Override
public void collect(int doc) throws IOException {
actualMatches.set(doc);
}
}, null, min, max);
assertTrue(next >= max);
}
}
final FixedBitSet expectedMatches = new FixedBitSet(maxDoc);
expectedMatches.or(req.iterator());
FixedBitSet excludedSet = new FixedBitSet(maxDoc);
excludedSet.or(excl.iterator());
expectedMatches.andNot(excludedSet);
assertArrayEquals(expectedMatches.getBits(), actualMatches.getBits());
}
}