Speed up top-k retrieval on filtered conjunctions. (#13994)

A while back we added an optimized bulk scorer that implements block-max AND;
this yielded a good speedup on nightly benchmarks (see annotation `FP` at
https://benchmarks.mikemccandless.com/AndHighHigh.html). With this PR, filtered
conjunctions now also run through this optimized bulk scorer by doing two
things:
 - It flattens inner conjunctions. A query initially written as something like
   `+(+term1 +term2) #filter` is now rewritten to
   `+term1 +term2 #filter`.
 - It evaluates queries that have a mix of MUST and FILTER clauses
   through `BlockMaxConjunctionBulkScorer` by treating FILTER clauses as
   scoring clauses that produce a score of 0.
This commit is contained in:
Adrien Grand 2024-11-18 08:51:35 +01:00 committed by GitHub
parent a0e1eeefeb
commit 4400d55297
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 175 additions and 24 deletions

View File

@ -93,6 +93,9 @@ Optimizations
longs that would pack two integers. We are now moving back to integers to be
able to take advantage of 2x more lanes with the vector API. (Adrien Grand)
* GITHUB#13994: Speed up top-k retrieval of filtered conjunctions.
(Adrien Grand)
Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended

View File

@ -268,6 +268,11 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
return new MatchNoDocsQuery("empty BooleanQuery");
}
// Queries with no positive clauses have no matches
if (clauses.size() == clauseSets.get(Occur.MUST_NOT).size()) {
return new MatchNoDocsQuery("pure negative BooleanQuery");
}
// optimize 1-clause queries
if (clauses.size() == 1) {
BooleanClause c = clauses.get(0);
@ -283,8 +288,6 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
// no scoring clauses, so return a score of 0
return new BoostQuery(new ConstantScoreQuery(query), 0);
case MUST_NOT:
// no positive clauses
return new MatchNoDocsQuery("pure negative BooleanQuery");
default:
throw new AssertionError();
}
@ -539,8 +542,7 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
boolean actuallyRewritten = false;
for (BooleanClause clause : clauses) {
if (clause.occur() == Occur.SHOULD && clause.query() instanceof BooleanQuery) {
BooleanQuery innerQuery = (BooleanQuery) clause.query();
if (clause.occur() == Occur.SHOULD && clause.query() instanceof BooleanQuery innerQuery) {
if (innerQuery.isPureDisjunction()) {
actuallyRewritten = true;
for (BooleanClause innerClause : innerQuery.clauses()) {
@ -558,6 +560,46 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
}
}
// Inline required / prohibited clauses. This helps run filtered conjunctive queries more
// efficiently by providing all clauses to the block-max AND scorer.
//
// Rewrite rule: a required outer clause whose query is itself a BooleanQuery with no SHOULD
// clauses and a minimumNumberShouldMatch of 0 is replaced by that inner query's own clauses.
{
// The rebuilt query keeps the outer minimumNumberShouldMatch unchanged.
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
// Becomes true as soon as at least one inner BooleanQuery has been inlined.
boolean actuallyRewritten = false;
for (BooleanClause outerClause : clauses) {
// Only required outer clauses that wrap a BooleanQuery are candidates for inlining.
if (outerClause.isRequired() && outerClause.query() instanceof BooleanQuery innerQuery) {
// Inlining prohibited clauses would not be legal if the inner query were a pure negation,
// since pure negations have no matches. This cannot happen here because such an inner
// BooleanQuery would have first rewritten to a MatchNoDocsQuery if it only had prohibited
// clauses; the assertion below checks that invariant.
assert innerQuery.getClauses(Occur.MUST_NOT).size() != innerQuery.clauses().size();
// Inner queries with SHOULD clauses or a non-zero minimumNumberShouldMatch are left nested.
if (innerQuery.getMinimumNumberShouldMatch() == 0
&& innerQuery.getClauses(Occur.SHOULD).isEmpty()) {
actuallyRewritten = true;
for (BooleanClause innerClause : innerQuery) {
Occur innerOccur = innerClause.occur();
// FILTER and MUST_NOT inner clauses are copied unchanged whatever the outer occur is,
// and every inner clause is copied unchanged when the outer clause is MUST.
if (innerOccur == Occur.FILTER
|| innerOccur == Occur.MUST_NOT
|| outerClause.occur() == Occur.MUST) {
builder.add(innerClause);
} else {
// Remaining combination: an inner MUST clause nested under an outer FILTER clause.
assert outerClause.occur() == Occur.FILTER && innerOccur == Occur.MUST;
// In this case we need to change the occur of the inner query from MUST to FILTER.
builder.add(innerClause.query(), Occur.FILTER);
}
}
} else {
// Not eligible for inlining: keep the outer clause as it is.
builder.add(outerClause);
}
} else {
// Non-required clause, or a clause that does not wrap a BooleanQuery: keep it as it is.
builder.add(outerClause);
}
}
// Only return a new query when something was inlined; otherwise fall through to the rules
// that follow this block.
if (actuallyRewritten) {
return builder.build();
}
}
// SHOULD clause count less than or equal to minimumNumberShouldMatch
// Important(this can only be processed after nested clauses have been flattened)
{

View File

@ -333,10 +333,15 @@ final class BooleanScorerSupplier extends ScorerSupplier {
requiredScoring.add(ss.get(leadCost));
}
if (scoreMode == ScoreMode.TOP_SCORES
&& requiredNoScoring.isEmpty()
&& requiredScoring.size() > 1
// Only specialize top-level conjunctions for clauses that don't have a two-phase iterator.
&& requiredNoScoring.stream().map(Scorer::twoPhaseIterator).allMatch(Objects::isNull)
&& requiredScoring.stream().map(Scorer::twoPhaseIterator).allMatch(Objects::isNull)) {
// Turn all filters into scoring clauses with a score of zero, so that
// BlockMaxConjunctionBulkScorer is applicable.
for (Scorer filter : requiredNoScoring) {
requiredScoring.add(new ConstantScoreScorer(0f, ScoreMode.COMPLETE, filter.iterator()));
}
return new BlockMaxConjunctionBulkScorer(maxDoc, requiredScoring);
}
if (scoreMode != ScoreMode.TOP_SCORES

View File

@ -330,8 +330,8 @@ public class TestBooleanRewrites extends LuceneTestCase {
int depth = TestUtil.nextInt(random(), 10, 30);
TestRewriteQuery rewriteQueryExpected = new TestRewriteQuery();
TestRewriteQuery rewriteQuery = new TestRewriteQuery();
Query expectedQuery =
new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER).build();
BooleanQuery.Builder expectedQueryBuilder =
new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER);
Query deepBuilder =
new BooleanQuery.Builder()
.add(rewriteQuery, Occur.SHOULD)
@ -345,21 +345,19 @@ public class TestBooleanRewrites extends LuceneTestCase {
.add(tq, Occur.SHOULD)
.add(deepBuilder, Occur.SHOULD);
deepBuilder = bq.build();
BooleanQuery.Builder expectedBq = new BooleanQuery.Builder().add(tq, Occur.FILTER);
expectedQueryBuilder.add(tq, Occur.FILTER);
if (i == depth) {
expectedBq.add(rewriteQuery, Occur.FILTER);
} else {
expectedBq.add(expectedQuery, Occur.FILTER);
expectedQueryBuilder.add(rewriteQuery, Occur.FILTER);
}
expectedQuery = expectedBq.build();
}
BooleanQuery bq = new BooleanQuery.Builder().add(deepBuilder, Occur.FILTER).build();
expectedQuery = new BoostQuery(new ConstantScoreQuery(expectedQuery), 0.0f);
Query expectedQuery =
new BoostQuery(new ConstantScoreQuery(expectedQueryBuilder.build()), 0.0f);
Query rewritten = searcher.rewrite(bq);
assertEquals(expectedQuery, rewritten);
// the SHOULD clauses cause more rewrites because they incrementally change to `MUST` and then
// `FILTER`
assertEquals("Depth=" + depth, depth + 1, rewriteQuery.numRewrites);
// `FILTER`, plus the flattening of required clauses
assertEquals("Depth=" + depth, depth * 2, rewriteQuery.numRewrites);
}
public void testDeeplyNestedBooleanRewrite() throws IOException {
@ -369,27 +367,26 @@ public class TestBooleanRewrites extends LuceneTestCase {
int depth = TestUtil.nextInt(random(), 10, 30);
TestRewriteQuery rewriteQueryExpected = new TestRewriteQuery();
TestRewriteQuery rewriteQuery = new TestRewriteQuery();
Query expectedQuery =
new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER).build();
BooleanQuery.Builder expectedQueryBuilder =
new BooleanQuery.Builder().add(rewriteQueryExpected, Occur.FILTER);
Query deepBuilder = new BooleanQuery.Builder().add(rewriteQuery, Occur.MUST).build();
for (int i = depth; i > 0; i--) {
TermQuery tq = termQueryFunction.apply(i);
BooleanQuery.Builder bq =
new BooleanQuery.Builder().add(tq, Occur.MUST).add(deepBuilder, Occur.MUST);
deepBuilder = bq.build();
BooleanQuery.Builder expectedBq = new BooleanQuery.Builder().add(tq, Occur.FILTER);
expectedQueryBuilder.add(tq, Occur.FILTER);
if (i == depth) {
expectedBq.add(rewriteQuery, Occur.FILTER);
} else {
expectedBq.add(expectedQuery, Occur.FILTER);
expectedQueryBuilder.add(rewriteQuery, Occur.FILTER);
}
expectedQuery = expectedBq.build();
}
BooleanQuery bq = new BooleanQuery.Builder().add(deepBuilder, Occur.FILTER).build();
expectedQuery = new BoostQuery(new ConstantScoreQuery(expectedQuery), 0.0f);
Query expectedQuery =
new BoostQuery(new ConstantScoreQuery(expectedQueryBuilder.build()), 0.0f);
Query rewritten = searcher.rewrite(bq);
assertEquals(expectedQuery, rewritten);
assertEquals("Depth=" + depth, 1, rewriteQuery.numRewrites);
// `depth` rewrites because of the flattening
assertEquals("Depth=" + depth, depth, rewriteQuery.numRewrites);
}
public void testRemoveMatchAllFilter() throws IOException {
@ -691,6 +688,110 @@ public class TestBooleanRewrites extends LuceneTestCase {
assertSame(query, searcher.rewrite(query));
}
/**
 * Checks that rewriting inlines the clauses of a nested conjunction into its parent
 * BooleanQuery, for several combinations of outer and inner clause occurs.
 */
public void testFlattenInnerConjunctions() throws IOException {
// Searcher over a MultiReader constructed with no sub-readers; only rewriting is exercised.
IndexSearcher searcher = newSearcher(new MultiReader());
// Case 1: inner MUST+MUST conjunction under an outer MUST, next to a FILTER sibling.
// Expected: both inner MUST clauses are hoisted next to the FILTER clause.
Query inner =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.build();
Query query =
new BooleanQuery.Builder()
.add(inner, Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.FILTER)
.build();
Query expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.FILTER)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));
// Case 2: same inner conjunction under an outer MUST, next to a SHOULD sibling with
// minimumNumberShouldMatch explicitly set to 0. Expected: the SHOULD clause is kept.
query =
new BooleanQuery.Builder()
.setMinimumNumberShouldMatch(0)
.add(inner, Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.setMinimumNumberShouldMatch(0)
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.SHOULD)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));
// Case 3: same inner conjunction under an outer MUST, next to a MUST_NOT sibling.
// Expected: the MUST_NOT clause is kept alongside the hoisted MUST clauses.
query =
new BooleanQuery.Builder()
.add(inner, Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST_NOT)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST_NOT)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));
// Case 4: inner MUST+FILTER under an outer MUST. Expected: inner occurs are preserved.
inner =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.FILTER)
.build();
query =
new BooleanQuery.Builder()
.add(inner, Occur.MUST)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));
// Case 5: inner MUST+FILTER under an outer FILTER. Expected: the inner MUST clause is
// turned into a FILTER clause, while the sibling MUST clause is unchanged.
inner =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.FILTER)
.build();
query =
new BooleanQuery.Builder()
.add(inner, Occur.FILTER)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "quux")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));
// Case 6: inner MUST+MUST_NOT under an outer FILTER. Expected: the inner MUST clause is
// turned into a FILTER clause and the inner MUST_NOT clause is kept as MUST_NOT.
inner =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.MUST)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST_NOT)
.build();
query =
new BooleanQuery.Builder()
.add(inner, Occur.FILTER)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
expectedRewritten =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo", "bar")), Occur.FILTER)
.add(new TermQuery(new Term("foo", "quux")), Occur.MUST_NOT)
.add(new TermQuery(new Term("foo", "baz")), Occur.MUST)
.build();
assertEquals(expectedRewritten, searcher.rewrite(query));
}
public void testDiscardShouldClauses() throws IOException {
IndexSearcher searcher = newSearcher(new MultiReader());