LUCENE-7925: Collapse duplicate SHOULD or MUST clauses by summing up their boosts.

This commit is contained in:
Adrien Grand 2017-08-21 11:38:08 +02:00
parent bb9b82f98b
commit ab793e7ab5
4 changed files with 124 additions and 3 deletions

View File

@ -33,6 +33,9 @@ Optimizations
than 8x greater than the cost of the lead iterator in order to use doc values. than 8x greater than the cost of the lead iterator in order to use doc values.
(Murali Krishna P via Adrien Grand) (Murali Krishna P via Adrien Grand)
* LUCENE-7925: Collapse duplicate SHOULD or MUST clauses by summing up their
boosts. (Adrien Grand)
Bug Fixes Bug Fixes
* LUCENE-7916: Prevent ArrayIndexOutOfBoundsException if ICUTokenizer is used * LUCENE-7916: Prevent ArrayIndexOutOfBoundsException if ICUTokenizer is used

View File

@ -23,6 +23,7 @@ import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.EnumMap; import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -332,6 +333,69 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
} }
} }
// Deduplicate SHOULD clauses by summing up their boosts
if (clauseSets.get(Occur.SHOULD).size() > 0 && minimumNumberShouldMatch <= 1) {
Map<Query, Double> shouldClauses = new HashMap<>();
for (Query query : clauseSets.get(Occur.SHOULD)) {
double boost = 1;
while (query instanceof BoostQuery) {
BoostQuery bq = (BoostQuery) query;
boost *= bq.getBoost();
query = bq.getQuery();
}
shouldClauses.put(query, shouldClauses.getOrDefault(query, 0d) + boost);
}
if (shouldClauses.size() != clauseSets.get(Occur.SHOULD).size()) {
BooleanQuery.Builder builder = new BooleanQuery.Builder()
.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
for (Map.Entry<Query,Double> entry : shouldClauses.entrySet()) {
Query query = entry.getKey();
float boost = entry.getValue().floatValue();
if (boost != 1f) {
query = new BoostQuery(query, boost);
}
builder.add(query, Occur.SHOULD);
}
for (BooleanClause clause : clauses) {
if (clause.getOccur() != Occur.SHOULD) {
builder.add(clause);
}
}
return builder.build();
}
}
// Deduplicate MUST clauses by summing up their boosts
if (clauseSets.get(Occur.MUST).size() > 0) {
Map<Query, Double> mustClauses = new HashMap<>();
for (Query query : clauseSets.get(Occur.MUST)) {
double boost = 1;
while (query instanceof BoostQuery) {
BoostQuery bq = (BoostQuery) query;
boost *= bq.getBoost();
query = bq.getQuery();
}
mustClauses.put(query, mustClauses.getOrDefault(query, 0d) + boost);
}
if (mustClauses.size() != clauseSets.get(Occur.MUST).size()) {
BooleanQuery.Builder builder = new BooleanQuery.Builder()
.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
for (Map.Entry<Query,Double> entry : mustClauses.entrySet()) {
Query query = entry.getKey();
float boost = entry.getValue().floatValue();
if (boost != 1f) {
query = new BoostQuery(query, boost);
}
builder.add(query, Occur.MUST);
}
for (BooleanClause clause : clauses) {
if (clause.getOccur() != Occur.MUST) {
builder.add(clause);
}
}
return builder.build();
}
}
// Rewrite queries whose single scoring clause is a MUST clause on a // Rewrite queries whose single scoring clause is a MUST clause on a
// MatchAllDocsQuery to a ConstantScoreQuery // MatchAllDocsQuery to a ConstantScoreQuery

View File

@ -427,4 +427,57 @@ public class TestBooleanRewrites extends LuceneTestCase {
assertEquals(expectedScore, actualScore, expectedScore / 100); // error under 1% assertEquals(expectedScore, actualScore, expectedScore / 100); // error under 1%
} }
} }
/**
 * Exercises rewriting of boolean queries that repeat the same SHOULD clause:
 * duplicates are expected to be merged into one clause whose boost is the sum
 * of the individual boosts, except when minimumNumberShouldMatch is 2, in
 * which case the query is expected to come back unchanged.
 */
public void testDeduplicateShouldClauses() throws IOException {
  final IndexSearcher searcher = newSearcher(new MultiReader());
  final Term bar = new Term("foo", "bar");
  final Term quux = new Term("foo", "quux");

  // Scenario 1: the only two clauses are identical, so the expected rewrite
  // is the bare term query boosted by 2 (1 + 1).
  final BooleanQuery.Builder duplicated = new BooleanQuery.Builder();
  duplicated.add(new TermQuery(bar), Occur.SHOULD);
  duplicated.add(new TermQuery(bar), Occur.SHOULD);
  final Query duplicatedQuery = duplicated.build();
  assertEquals(new BoostQuery(new TermQuery(bar), 2f), searcher.rewrite(duplicatedQuery));

  // Scenario 2: an unboosted and a 2x-boosted copy of the same term next to
  // an unrelated term; the copies are expected to merge into a boost of 3.
  final BooleanQuery.Builder mixed = new BooleanQuery.Builder();
  mixed.add(new TermQuery(bar), Occur.SHOULD);
  mixed.add(new BoostQuery(new TermQuery(bar), 2f), Occur.SHOULD);
  mixed.add(new TermQuery(quux), Occur.SHOULD);
  final Query mixedQuery = mixed.build();

  final BooleanQuery.Builder merged = new BooleanQuery.Builder();
  merged.add(new BoostQuery(new TermQuery(bar), 3f), Occur.SHOULD);
  merged.add(new TermQuery(quux), Occur.SHOULD);
  final Query mergedQuery = merged.build();
  assertEquals(mergedQuery, searcher.rewrite(mixedQuery));

  // Scenario 3: with minimumNumberShouldMatch set to 2 the rewritten query
  // is expected to equal the original, duplicates included.
  final BooleanQuery.Builder constrained = new BooleanQuery.Builder();
  constrained.setMinimumNumberShouldMatch(2);
  constrained.add(new TermQuery(bar), Occur.SHOULD);
  constrained.add(new TermQuery(bar), Occur.SHOULD);
  constrained.add(new TermQuery(quux), Occur.SHOULD);
  final Query constrainedQuery = constrained.build();
  assertEquals(constrainedQuery, searcher.rewrite(constrainedQuery));
}
/**
 * Exercises rewriting of boolean queries that repeat the same MUST clause:
 * duplicates are expected to be merged into one clause whose boost is the sum
 * of the individual boosts.
 */
public void testDeduplicateMustClauses() throws IOException {
  final IndexSearcher searcher = newSearcher(new MultiReader());
  final Term bar = new Term("foo", "bar");
  final Term quux = new Term("foo", "quux");

  // Scenario 1: the only two clauses are identical, so the expected rewrite
  // is the bare term query boosted by 2 (1 + 1).
  final BooleanQuery.Builder duplicated = new BooleanQuery.Builder();
  duplicated.add(new TermQuery(bar), Occur.MUST);
  duplicated.add(new TermQuery(bar), Occur.MUST);
  final Query duplicatedQuery = duplicated.build();
  assertEquals(new BoostQuery(new TermQuery(bar), 2f), searcher.rewrite(duplicatedQuery));

  // Scenario 2: an unboosted and a 2x-boosted copy of the same term next to
  // an unrelated required term; the copies are expected to merge into a
  // boost of 3 while the unrelated clause is kept as is.
  final BooleanQuery.Builder mixed = new BooleanQuery.Builder();
  mixed.add(new TermQuery(bar), Occur.MUST);
  mixed.add(new BoostQuery(new TermQuery(bar), 2f), Occur.MUST);
  mixed.add(new TermQuery(quux), Occur.MUST);
  final Query mixedQuery = mixed.build();

  final BooleanQuery.Builder merged = new BooleanQuery.Builder();
  merged.add(new BoostQuery(new TermQuery(bar), 3f), Occur.MUST);
  merged.add(new TermQuery(quux), Occur.MUST);
  final Query mergedQuery = merged.build();
  assertEquals(mergedQuery, searcher.rewrite(mixedQuery));
}
} }

View File

@ -90,7 +90,8 @@ public class TestConstantScoreQuery extends LuceneTestCase {
RandomIndexWriter writer = new RandomIndexWriter (random(), directory); RandomIndexWriter writer = new RandomIndexWriter (random(), directory);
Document doc = new Document(); Document doc = new Document();
doc.add(newStringField("field", "term", Field.Store.NO)); doc.add(newStringField("field", "term1", Field.Store.NO));
doc.add(newStringField("field", "term2", Field.Store.NO));
writer.addDocument(doc); writer.addDocument(doc);
reader = writer.getReader(); reader = writer.getReader();
@ -99,8 +100,8 @@ public class TestConstantScoreQuery extends LuceneTestCase {
searcher = newSearcher(reader, true, false); searcher = newSearcher(reader, true, false);
searcher.setQueryCache(null); // to assert on scorer impl searcher.setQueryCache(null); // to assert on scorer impl
final BoostQuery csq1 = new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term ("field", "term"))), 2f); final BoostQuery csq1 = new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term ("field", "term1"))), 2f);
final BoostQuery csq2 = new BoostQuery(new ConstantScoreQuery(csq1), 5f); final BoostQuery csq2 = new BoostQuery(new ConstantScoreQuery(new ConstantScoreQuery(new TermQuery(new Term ("field", "term2")))), 5f);
final BooleanQuery.Builder bq = new BooleanQuery.Builder(); final BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(csq1, BooleanClause.Occur.SHOULD); bq.add(csq1, BooleanClause.Occur.SHOULD);