LUCENE-7925: Collapse duplicate SHOULD or MUST clauses by summing up their boosts.

This commit is contained in:
Adrien Grand 2017-08-21 11:38:08 +02:00
parent bb9b82f98b
commit ab793e7ab5
4 changed files with 124 additions and 3 deletions

View File

@ -33,6 +33,9 @@ Optimizations
than 8x greater than the cost of the lead iterator in order to use doc values.
(Murali Krishna P via Adrien Grand)
* LUCENE-7925: Collapse duplicate SHOULD or MUST clauses by summing up their
boosts. (Adrien Grand)
Bug Fixes
* LUCENE-7916: Prevent ArrayIndexOutOfBoundsException if ICUTokenizer is used

View File

@ -23,6 +23,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@ -332,6 +333,69 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
}
}
// Merge SHOULD clauses that wrap the same query into a single clause whose
// boost is the sum of the individual boosts. Skipped when more than one
// SHOULD clause is required to match.
if (minimumNumberShouldMatch <= 1 && clauseSets.get(Occur.SHOULD).size() != 0) {
  // Boosts are accumulated in double precision. Nested BoostQuery wrappers
  // are peeled off and their boosts multiplied together.
  final Map<Query, Double> summedShouldBoosts = new HashMap<>();
  for (Query shouldQuery : clauseSets.get(Occur.SHOULD)) {
    Query unwrapped = shouldQuery;
    double effectiveBoost = 1;
    while (unwrapped instanceof BoostQuery) {
      final BoostQuery wrapper = (BoostQuery) unwrapped;
      effectiveBoost *= wrapper.getBoost();
      unwrapped = wrapper.getQuery();
    }
    final Double soFar = summedShouldBoosts.get(unwrapped);
    summedShouldBoosts.put(unwrapped, (soFar == null ? 0d : soFar) + effectiveBoost);
  }
  // A size mismatch means at least two clauses collapsed onto the same key.
  if (summedShouldBoosts.size() != clauseSets.get(Occur.SHOULD).size()) {
    final BooleanQuery.Builder dedupBuilder = new BooleanQuery.Builder();
    dedupBuilder.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
    for (Map.Entry<Query, Double> summed : summedShouldBoosts.entrySet()) {
      final float mergedBoost = summed.getValue().floatValue();
      // Only wrap in a BoostQuery when the summed boost differs from 1.
      final Query mergedQuery = mergedBoost == 1f
          ? summed.getKey()
          : new BoostQuery(summed.getKey(), mergedBoost);
      dedupBuilder.add(mergedQuery, Occur.SHOULD);
    }
    // Every clause that is not a SHOULD clause is carried over unchanged.
    for (BooleanClause other : clauses) {
      if (other.getOccur() == Occur.SHOULD) {
        continue;
      }
      dedupBuilder.add(other);
    }
    return dedupBuilder.build();
  }
}
// Merge MUST clauses that wrap the same query into a single clause whose
// boost is the sum of the individual boosts.
if (clauseSets.get(Occur.MUST).size() != 0) {
  // Key: the query with every enclosing BoostQuery stripped.
  // Value: the running sum of the boosts seen for that query.
  final Map<Query, Double> summedMustBoosts = new HashMap<>();
  for (Query mustQuery : clauseSets.get(Occur.MUST)) {
    Query inner = mustQuery;
    double product = 1;
    // Nested boosts multiply, so fold them into one factor while unwrapping.
    while (inner instanceof BoostQuery) {
      final BoostQuery boosted = (BoostQuery) inner;
      product *= boosted.getBoost();
      inner = boosted.getQuery();
    }
    final Double previousSum = summedMustBoosts.get(inner);
    summedMustBoosts.put(inner, (previousSum == null ? 0d : previousSum) + product);
  }
  // A size mismatch means at least two clauses collapsed onto the same key.
  if (summedMustBoosts.size() != clauseSets.get(Occur.MUST).size()) {
    final BooleanQuery.Builder mustDedupBuilder = new BooleanQuery.Builder();
    mustDedupBuilder.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
    for (Map.Entry<Query, Double> summed : summedMustBoosts.entrySet()) {
      final float totalBoost = summed.getValue().floatValue();
      Query collapsed = summed.getKey();
      // Only wrap in a BoostQuery when the summed boost differs from 1.
      if (totalBoost != 1f) {
        collapsed = new BoostQuery(collapsed, totalBoost);
      }
      mustDedupBuilder.add(collapsed, Occur.MUST);
    }
    // Every clause that is not a MUST clause is carried over unchanged.
    for (BooleanClause other : clauses) {
      if (other.getOccur() == Occur.MUST) {
        continue;
      }
      mustDedupBuilder.add(other);
    }
    return mustDedupBuilder.build();
  }
}
// Rewrite queries whose single scoring clause is a MUST clause on a
// MatchAllDocsQuery to a ConstantScoreQuery

View File

@ -427,4 +427,57 @@ public class TestBooleanRewrites extends LuceneTestCase {
assertEquals(expectedScore, actualScore, expectedScore / 100); // error under 1%
}
}
/**
 * Verifies how rewriting handles repeated SHOULD clauses: duplicates are
 * expected to collapse into one clause with summed boosts, except when
 * minimumNumberShouldMatch is 2, where the query must come back unchanged.
 */
public void testDeduplicateShouldClauses() throws IOException {
  // Searcher over a MultiReader constructed with no sub-readers.
  final IndexSearcher emptySearcher = newSearcher(new MultiReader());
  final Term barTerm = new Term("foo", "bar");
  final Term quuxTerm = new Term("foo", "quux");

  // Two identical SHOULD clauses -> the term query boosted by 2.
  BooleanQuery.Builder input = new BooleanQuery.Builder();
  input.add(new TermQuery(barTerm), Occur.SHOULD);
  input.add(new TermQuery(barTerm), Occur.SHOULD);
  Query original = input.build();
  Query wanted = new BoostQuery(new TermQuery(barTerm), 2);
  assertEquals(wanted, emptySearcher.rewrite(original));

  // Boost 1 plus boost 2 on the same term -> boost 3; the other term is kept.
  input = new BooleanQuery.Builder();
  input.add(new TermQuery(barTerm), Occur.SHOULD);
  input.add(new BoostQuery(new TermQuery(barTerm), 2), Occur.SHOULD);
  input.add(new TermQuery(quuxTerm), Occur.SHOULD);
  original = input.build();
  BooleanQuery.Builder wantedBuilder = new BooleanQuery.Builder();
  wantedBuilder.add(new BoostQuery(new TermQuery(barTerm), 3), Occur.SHOULD);
  wantedBuilder.add(new TermQuery(quuxTerm), Occur.SHOULD);
  wanted = wantedBuilder.build();
  assertEquals(wanted, emptySearcher.rewrite(original));

  // With minimumNumberShouldMatch == 2 the rewrite must leave the query as is.
  input = new BooleanQuery.Builder();
  input.setMinimumNumberShouldMatch(2);
  input.add(new TermQuery(barTerm), Occur.SHOULD);
  input.add(new TermQuery(barTerm), Occur.SHOULD);
  input.add(new TermQuery(quuxTerm), Occur.SHOULD);
  original = input.build();
  assertEquals(original, emptySearcher.rewrite(original));
}
public void testDeduplicateMustClauses() throws IOException {
IndexSearcher searcher = newSearcher(new MultiReader());
Query query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.build();
Query expected = new BoostQuery(new TermQuery(new Term("foo", "bar")), 2);
assertEquals(expected, searcher.rewrite(query));
query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new BoostQuery(new TermQuery(new Term("foo", "bar")), 2), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.build();
expected = new BooleanQuery.Builder()
.add(new BoostQuery(new TermQuery(new Term("foo", "bar")), 3), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.build();
assertEquals(expected, searcher.rewrite(query));
}
}

View File

@ -90,7 +90,8 @@ public class TestConstantScoreQuery extends LuceneTestCase {
RandomIndexWriter writer = new RandomIndexWriter (random(), directory);
Document doc = new Document();
doc.add(newStringField("field", "term", Field.Store.NO));
doc.add(newStringField("field", "term1", Field.Store.NO));
doc.add(newStringField("field", "term2", Field.Store.NO));
writer.addDocument(doc);
reader = writer.getReader();
@ -99,8 +100,8 @@ public class TestConstantScoreQuery extends LuceneTestCase {
searcher = newSearcher(reader, true, false);
searcher.setQueryCache(null); // to assert on scorer impl
final BoostQuery csq1 = new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term ("field", "term"))), 2f);
final BoostQuery csq2 = new BoostQuery(new ConstantScoreQuery(csq1), 5f);
final BoostQuery csq1 = new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term ("field", "term1"))), 2f);
final BoostQuery csq2 = new BoostQuery(new ConstantScoreQuery(new ConstantScoreQuery(new TermQuery(new Term ("field", "term2")))), 5f);
final BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(csq1, BooleanClause.Occur.SHOULD);