mirror of https://github.com/apache/lucene.git
Handle more cases in `BooleanWeight#count`. (#961)
As suggested by @zhaih on #950, we could support more cases in `BooleanWeight#count`. This PR adds support for these cases specifically: - Pure disjunctions where only one clause has a non-zero count. - Pure disjunctions where one clause matches all docs. - Negations where positive clauses match all docs (pure negation). - Negations where positive clauses match no docs. - Negations where negative clauses match no docs. - Negations where negative clauses match all docs.
This commit is contained in:
parent
adcf58fe87
commit
68f77dbf53
|
@ -93,6 +93,9 @@ Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
* LUCENE-8519: MultiDocValues.getNormValues should not call getMergedFieldInfos (Rushabh Shah)
|
* LUCENE-8519: MultiDocValues.getNormValues should not call getMergedFieldInfos (Rushabh Shah)
|
||||||
|
|
||||||
|
* GITHUB#961: BooleanQuery can return quick counts for simple boolean queries.
|
||||||
|
(Adrien Grand)
|
||||||
|
|
||||||
* LUCENE-10618: Implement BooleanQuery rewrite rules based for minimumShouldMatch. (Fang Hou)
|
* LUCENE-10618: Implement BooleanQuery rewrite rules based for minimumShouldMatch. (Fang Hou)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
|
@ -346,29 +346,50 @@ final class BooleanWeight extends Weight {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int count(LeafReaderContext context) throws IOException {
|
public int count(LeafReaderContext context) throws IOException {
|
||||||
// Implement counting for pure conjunctions in the case when one clause doesn't match any docs,
|
final int numDocs = context.reader().numDocs();
|
||||||
// or all clauses but one match all docs.
|
int positiveCount;
|
||||||
if (weightedClauses.isEmpty()) {
|
if (query.isPureDisjunction()) {
|
||||||
|
positiveCount = optCount(context, Occur.SHOULD);
|
||||||
|
} else if ((query.getClauses(Occur.FILTER).isEmpty() == false
|
||||||
|
|| query.getClauses(Occur.MUST).isEmpty() == false)
|
||||||
|
&& query.getMinimumNumberShouldMatch() == 0) {
|
||||||
|
positiveCount = reqCount(context);
|
||||||
|
} else {
|
||||||
|
// The query has a non-zero min-should match. We could handles some cases, e.g.
|
||||||
|
// minShouldMatch=N and we can find N SHOULD clauses that match all docs, but are there
|
||||||
|
// real-world queries that would benefit from Lucene handling this case?
|
||||||
|
positiveCount = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (positiveCount == 0) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
for (WeightedBooleanClause weightedClause : weightedClauses) {
|
|
||||||
switch (weightedClause.clause.getOccur()) {
|
int prohibitedCount = optCount(context, Occur.MUST_NOT);
|
||||||
case FILTER:
|
if (prohibitedCount == -1) {
|
||||||
case MUST:
|
return -1;
|
||||||
break;
|
} else if (prohibitedCount == 0) {
|
||||||
case MUST_NOT:
|
return positiveCount;
|
||||||
case SHOULD:
|
} else if (prohibitedCount == numDocs) {
|
||||||
default:
|
return 0;
|
||||||
return super.count(context);
|
} else if (positiveCount == numDocs) {
|
||||||
|
return numDocs - prohibitedCount;
|
||||||
|
} else {
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (query.getMinimumNumberShouldMatch() > 0) {
|
|
||||||
return super.count(context);
|
/**
|
||||||
}
|
* Return the number of matches of required clauses, or -1 if unknown, or numDocs if there are no
|
||||||
// From now on we know the query is a pure conjunction
|
* required clauses.
|
||||||
|
*/
|
||||||
|
private int reqCount(LeafReaderContext context) throws IOException {
|
||||||
final int numDocs = context.reader().numDocs();
|
final int numDocs = context.reader().numDocs();
|
||||||
int conjunctionCount = numDocs;
|
int reqCount = numDocs;
|
||||||
for (WeightedBooleanClause weightedClause : weightedClauses) {
|
for (WeightedBooleanClause weightedClause : weightedClauses) {
|
||||||
|
if (weightedClause.clause.isRequired() == false) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
int count = weightedClause.weight.count(context);
|
int count = weightedClause.weight.count(context);
|
||||||
if (count == -1 || count == 0) {
|
if (count == -1 || count == 0) {
|
||||||
// If the count of one clause is unknown, then the count of the conjunction is unknown too.
|
// If the count of one clause is unknown, then the count of the conjunction is unknown too.
|
||||||
|
@ -376,17 +397,49 @@ final class BooleanWeight extends Weight {
|
||||||
return count;
|
return count;
|
||||||
} else if (count == numDocs) {
|
} else if (count == numDocs) {
|
||||||
// the query matches all docs, it can be safely ignored
|
// the query matches all docs, it can be safely ignored
|
||||||
} else if (conjunctionCount == numDocs) {
|
} else if (reqCount == numDocs) {
|
||||||
// all clauses seen so far match all docs, so the count of the new clause is also the count
|
// all clauses seen so far match all docs, so the count of the new clause is also the count
|
||||||
// of the conjunction
|
// of the conjunction
|
||||||
conjunctionCount = count;
|
reqCount = count;
|
||||||
} else {
|
} else {
|
||||||
// We have two clauses whose count is in [1, numDocs), we can't figure out the number of
|
// We have two clauses whose count is in [1, numDocs), we can't figure out the number of
|
||||||
// docs that match the conjunction without running the query.
|
// docs that match the conjunction without running the query.
|
||||||
return super.count(context);
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return conjunctionCount;
|
return reqCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the number of matches of optional clauses, or -1 if unknown, or 0 if there are no
|
||||||
|
* optional clauses.
|
||||||
|
*/
|
||||||
|
private int optCount(LeafReaderContext context, Occur occur) throws IOException {
|
||||||
|
final int numDocs = context.reader().numDocs();
|
||||||
|
int optCount = 0;
|
||||||
|
for (WeightedBooleanClause weightedClause : weightedClauses) {
|
||||||
|
if (weightedClause.clause.getOccur() != occur) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int count = weightedClause.weight.count(context);
|
||||||
|
if (count == -1 || count == numDocs) {
|
||||||
|
// If any of the clauses has a number of matches that is unknown, the number of matches of
|
||||||
|
// the disjunction is unknown.
|
||||||
|
// If either clause matches all docs, then the disjunction matches all docs.
|
||||||
|
return count;
|
||||||
|
} else if (count == 0) {
|
||||||
|
// We can safely ignore this clause, it doesn't affect the count.
|
||||||
|
} else if (optCount == 0) {
|
||||||
|
// This is the first clause we see that has a non-zero count, it becomes the count of the
|
||||||
|
// disjunction.
|
||||||
|
optCount = count;
|
||||||
|
} else {
|
||||||
|
// We have two clauses whose count is in [1, numDocs), we can't figure out the number of
|
||||||
|
// docs that match the disjunction without running the query.
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return optCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -839,6 +839,195 @@ public class TestBooleanQuery extends LuceneTestCase {
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testDisjunctionMatchesCount() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
|
||||||
|
Document doc = new Document();
|
||||||
|
LongPoint longPoint = new LongPoint("long", 3L);
|
||||||
|
doc.add(longPoint);
|
||||||
|
StringField stringField = new StringField("string", "abc", Store.NO);
|
||||||
|
doc.add(stringField);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
longPoint.setLongValue(10);
|
||||||
|
stringField.setStringValue("xyz");
|
||||||
|
writer.addDocument(doc);
|
||||||
|
IndexReader reader = DirectoryReader.open(writer);
|
||||||
|
writer.close();
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
|
||||||
|
Query query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("string", "abc")), Occur.SHOULD)
|
||||||
|
.add(LongPoint.newExactQuery("long", 3L), Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
// Both queries match a single doc, BooleanWeight can't figure out the count of the disjunction
|
||||||
|
assertEquals(-1, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("string", "missing")), Occur.SHOULD)
|
||||||
|
.add(LongPoint.newExactQuery("long", 3L), Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
// One query has a count of 0, the disjunction count is the other count
|
||||||
|
assertEquals(1, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("string", "abc")), Occur.SHOULD)
|
||||||
|
.add(LongPoint.newExactQuery("long", 5L), Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
// One query has a count of 0, the disjunction count is the other count
|
||||||
|
assertEquals(1, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("string", "abc")), Occur.SHOULD)
|
||||||
|
.add(LongPoint.newRangeQuery("long", 0L, 10L), Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
// One query matches all docs, the count of the disjunction is the number of docs
|
||||||
|
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
assertEquals(2, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new MatchAllDocsQuery(), Occur.SHOULD)
|
||||||
|
.add(LongPoint.newRangeQuery("long", 1L, 5L), Occur.SHOULD)
|
||||||
|
.build();
|
||||||
|
// One query matches all docs, the count of the disjunction is the number of docs
|
||||||
|
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
assertEquals(2, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testProhibitedMatchesCount() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
|
||||||
|
Document doc = new Document();
|
||||||
|
LongPoint longPoint = new LongPoint("long", 3L);
|
||||||
|
doc.add(longPoint);
|
||||||
|
StringField stringField = new StringField("string", "abc", Store.NO);
|
||||||
|
doc.add(stringField);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
longPoint.setLongValue(10);
|
||||||
|
stringField.setStringValue("xyz");
|
||||||
|
writer.addDocument(doc);
|
||||||
|
IndexReader reader = DirectoryReader.open(writer);
|
||||||
|
writer.close();
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
|
||||||
|
Query query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("string", "abc")), Occur.MUST)
|
||||||
|
.add(LongPoint.newExactQuery("long", 3L), Occur.MUST_NOT)
|
||||||
|
.build();
|
||||||
|
Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
// Both queries match a single doc, BooleanWeight can't figure out the count of the query
|
||||||
|
assertEquals(-1, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("string", "missing")), Occur.MUST)
|
||||||
|
.add(LongPoint.newExactQuery("long", 3L), Occur.MUST_NOT)
|
||||||
|
.build();
|
||||||
|
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
// the positive clause doesn't match any docs, so the overall query doesn't either
|
||||||
|
assertEquals(0, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("string", "abc")), Occur.MUST)
|
||||||
|
.add(LongPoint.newExactQuery("long", 5L), Occur.MUST_NOT)
|
||||||
|
.build();
|
||||||
|
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
// the negative clause doesn't match any docs, so the overall count is the count of the positive
|
||||||
|
// clause
|
||||||
|
assertEquals(1, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(new TermQuery(new Term("string", "abc")), Occur.MUST)
|
||||||
|
.add(LongPoint.newRangeQuery("long", 0L, 10L), Occur.MUST_NOT)
|
||||||
|
.build();
|
||||||
|
// the negative clause matches all docs, so the query doesn't match any docs
|
||||||
|
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
assertEquals(0, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
query =
|
||||||
|
new BooleanQuery.Builder()
|
||||||
|
.add(LongPoint.newRangeQuery("long", 0L, 10L), Occur.MUST)
|
||||||
|
.add(new TermQuery(new Term("string", "abc")), Occur.MUST_NOT)
|
||||||
|
.build();
|
||||||
|
// The positive clause matches all docs, so we can subtract the number of matches of the
|
||||||
|
// negative clause
|
||||||
|
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
|
||||||
|
assertEquals(1, weight.count(reader.leaves().get(0)));
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomBooleanQueryMatchesCount() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
|
||||||
|
Document doc = new Document();
|
||||||
|
LongPoint longPoint = new LongPoint("long", 3L);
|
||||||
|
doc.add(longPoint);
|
||||||
|
StringField stringField = new StringField("string", "abc", Store.NO);
|
||||||
|
doc.add(stringField);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
longPoint.setLongValue(10);
|
||||||
|
stringField.setStringValue("xyz");
|
||||||
|
writer.addDocument(doc);
|
||||||
|
IndexReader reader = DirectoryReader.open(writer);
|
||||||
|
writer.close();
|
||||||
|
IndexSearcher searcher = new IndexSearcher(reader);
|
||||||
|
for (int iter = 0; iter < 1000; ++iter) {
|
||||||
|
final int numClauses = TestUtil.nextInt(random(), 2, 5);
|
||||||
|
BooleanQuery.Builder builder = new BooleanQuery.Builder();
|
||||||
|
int numShouldClauses = 0;
|
||||||
|
for (int i = 0; i < numClauses; ++i) {
|
||||||
|
Query query;
|
||||||
|
switch (random().nextInt(6)) {
|
||||||
|
case 0:
|
||||||
|
query = new TermQuery(new Term("string", "abc"));
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
query = LongPoint.newExactQuery("long", 3L);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
query = new TermQuery(new Term("string", "missing"));
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
query = LongPoint.newExactQuery("long", 5L);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
query = new MatchAllDocsQuery();
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
query = LongPoint.newRangeQuery("long", 0L, 10L);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Occur occur = RandomPicks.randomFrom(random(), Occur.values());
|
||||||
|
if (occur == Occur.SHOULD) {
|
||||||
|
numShouldClauses++;
|
||||||
|
}
|
||||||
|
builder.add(query, occur);
|
||||||
|
}
|
||||||
|
builder.setMinimumNumberShouldMatch(TestUtil.nextInt(random(), 0, numShouldClauses));
|
||||||
|
Query booleanQuery = builder.build();
|
||||||
|
assertEquals(
|
||||||
|
(int) searcher.search(booleanQuery, new TotalHitCountCollectorManager()),
|
||||||
|
searcher.count(booleanQuery));
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
public void testToString() {
|
public void testToString() {
|
||||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||||
bq.add(new TermQuery(new Term("field", "a")), Occur.SHOULD);
|
bq.add(new TermQuery(new Term("field", "a")), Occur.SHOULD);
|
||||||
|
|
Loading…
Reference in New Issue