Optimize counts on two clause term disjunctions (#13036)

Calculate count(clause1 OR clause2) as count(clause1)
+ count(clause2) - count(clause1 AND clause2)
This commit is contained in:
Johannes Fredén 2024-02-01 13:48:53 +01:00 committed by GitHub
parent 3d47a0d5c2
commit b537e1da27
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 193 additions and 0 deletions

View File

@ -232,6 +232,8 @@ Optimizations
* GITHUB#13052: Avoid set.removeAll(list) O(n^2) performance trap in the UpgradeIndexMergePolicy (Dmitry Cherniachenko)
* GITHUB#13036: Optimize counts on two clause term disjunctions. (Adrien Grand, Johannes Fredén)
Bug Fixes
---------------------
* GITHUB#12866: Prevent extra similarity computation for single-level HNSW graphs. (Kaival Parikh)

View File

@ -30,6 +30,7 @@ import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Predicate;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.BooleanClause.Occur;
/**
@ -179,6 +180,38 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
return clauses.size() == getClauses(Occur.SHOULD).size() && minimumNumberShouldMatch <= 1;
}
/**
 * Returns {@code true} when this query consists of exactly two clauses, is a pure disjunction,
 * and both clauses wrap a {@link TermQuery}.
 */
boolean isTwoClausePureDisjunctionWithTerms() {
  // Cheap structural checks first; bail out before touching the individual clauses.
  if (clauses.size() != 2) {
    return false;
  }
  if (isPureDisjunction() == false) {
    return false;
  }
  final boolean firstIsTerm = clauses.get(0).getQuery() instanceof TermQuery;
  return firstIsTerm && clauses.get(1).getQuery() instanceof TermQuery;
}
/**
 * Breaks a two clause term disjunction into the pieces needed to count it with the
 * inclusion-exclusion principle.
 *
 * <p>The returned array has three entries: the term query of the first clause, the term query of
 * the second clause, and a conjunction that requires both of those term queries to match.
 *
 * @param indexSearcher searcher used to build term states for term queries that carry none
 * @return the two term queries followed by their conjunction
 * @throws IOException if building the term states fails
 */
Query[] rewriteTwoClauseDisjunctionWithTermsForCount(IndexSearcher indexSearcher)
    throws IOException {
  Query[] result = new Query[3];
  BooleanQuery.Builder conjunction = new BooleanQuery.Builder();
  for (int i = 0; i < clauses.size(); i++) {
    TermQuery original = (TermQuery) clauses.get(i).getQuery();
    // Each term query ends up being counted more than once (on its own and inside the
    // conjunction), so attach term states up front to avoid repeated terms dictionary lookups.
    TermQuery withStates =
        original.getTermStates() != null
            ? original
            : new TermQuery(
                original.getTerm(), TermStates.build(indexSearcher, original.getTerm(), false));
    conjunction.add(withStates, Occur.MUST);
    result[i] = withStates;
  }
  result[2] = conjunction.build();
  return result;
}
/**
* Returns an iterator on the clauses in this query. It implements the {@link Iterable} interface
* to make it possible to do:

View File

@ -420,6 +420,27 @@ public class IndexSearcher {
* possible.
*/
public int count(Query query) throws IOException {
  // The optimization check below inspects the rewritten form of the query, so rewrite first and
  // strip the constant-score wrapper again if it survived the rewrite.
  Query rewritten = rewrite(new ConstantScoreQuery(query));
  if (rewritten instanceof ConstantScoreQuery wrapper) {
    rewritten = wrapper.getQuery();
  }

  // Two clause term disjunction on a reader without deletions:
  // count(a OR b) = count(a) + count(b) - count(a AND b)
  if (rewritten instanceof BooleanQuery disjunction
      && this.reader.hasDeletions() == false
      && disjunction.isTwoClausePureDisjunctionWithTerms()) {
    Query[] parts = disjunction.rewriteTwoClauseDisjunctionWithTermsForCount(this);
    int firstCount = count(parts[0]);
    int secondCount = count(parts[1]);

    // If either term matches nothing, the disjunction matches exactly what the other term does.
    if (firstCount == 0 || secondCount == 0) {
      return Math.max(firstCount, secondCount);
    }

    // Only count the conjunction when one term is much rarer than the other (ratio below 0.1),
    // which bounds the intersection to a small fraction of the union.
    int rarer = Math.min(firstCount, secondCount);
    int moreFrequent = Math.max(firstCount, secondCount);
    double ratio = (double) rarer / moreFrequent;
    if (ratio < 0.1) {
      return firstCount + secondCount - count(parts[2]);
    }
  }

  // General case: collect the total hit count of the rewritten query.
  return search(new ConstantScoreQuery(rewritten), new TotalHitCountCollectorManager());
}

View File

@ -962,6 +962,143 @@ public class TestBooleanQuery extends LuceneTestCase {
dir.close();
}
/**
 * Verifies the counts returned for two clause term disjunctions, and how many times {@code
 * IndexSearcher#count} is invoked while computing them.
 *
 * <p>The index holds {@code largerTermCount} documents with the value "large" and {@code
 * smallerTermCount} documents with both "small" and "also small". The smaller count is always
 * less than a tenth of the larger one.
 */
public void testTwoClauseTermDisjunctionCountOptimization() throws Exception {
  int largerTermCount = RandomNumbers.randomIntBetween(random(), 11, 100);
  int smallerTermCount = RandomNumbers.randomIntBetween(random(), 1, (largerTermCount - 1) / 10);

  List<String[]> docContent = new ArrayList<>(largerTermCount + smallerTermCount);

  for (int i = 0; i < largerTermCount; i++) {
    docContent.add(new String[] {"large"});
  }

  for (int i = 0; i < smallerTermCount; i++) {
    docContent.add(new String[] {"small", "also small"});
  }

  try (Directory dir = newDirectory()) {
    try (IndexWriter w =
        new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) {

      for (String[] values : docContent) {
        Document doc = new Document();
        for (String value : values) {
          doc.add(new StringField("foo", value, Field.Store.NO));
        }
        w.addDocument(doc);
      }
      w.forceMerge(1);
    }

    try (IndexReader reader = DirectoryReader.open(dir)) {
      // Single-element array so the anonymous subclass below can mutate the counter.
      final int[] countInvocations = new int[] {0};
      IndexSearcher countingIndexSearcher =
          new IndexSearcher(reader) {
            @Override
            public int count(Query query) throws IOException {
              countInvocations[0]++;
              return super.count(query);
            }
          };

      {
        // Neither term matches any document.
        countInvocations[0] = 0;
        BooleanQuery query =
            new BooleanQuery.Builder()
                .add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD)
                .add(new TermQuery(new Term("foo", "also no match")), BooleanClause.Occur.SHOULD)
                .build();

        assertEquals(0, countingIndexSearcher.count(query));
        // One call for the disjunction plus one per term; the conjunction is not counted.
        assertEquals(3, countInvocations[0]);
      }
      {
        // The first term matches nothing, the second term matches.
        countInvocations[0] = 0;
        BooleanQuery query =
            new BooleanQuery.Builder()
                .add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD)
                .add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
                .build();

        assertEquals(smallerTermCount, countingIndexSearcher.count(query));
        assertEquals(3, countInvocations[0]);
      }
      {
        // The first term matches, the second term matches nothing.
        countInvocations[0] = 0;
        BooleanQuery query =
            new BooleanQuery.Builder()
                .add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
                .add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD)
                .build();

        assertEquals(smallerTermCount, countingIndexSearcher.count(query));
        assertEquals(3, countInvocations[0]);
      }
      {
        // Both terms match and the optimization threshold is hit, with the small term first.
        countInvocations[0] = 0;
        BooleanQuery query =
            new BooleanQuery.Builder()
                .add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
                .add(new TermQuery(new Term("foo", "large")), BooleanClause.Occur.SHOULD)
                .build();

        int count = countingIndexSearcher.count(query);

        assertEquals(largerTermCount + smallerTermCount, count);
        // One call for the disjunction, one per term, and one for the conjunction.
        assertEquals(4, countInvocations[0]);

        assertTrue(query.isTwoClausePureDisjunctionWithTerms());
        Query[] queries =
            query.rewriteTwoClauseDisjunctionWithTermsForCount(countingIndexSearcher);
        assertEquals(3, queries.length);
        assertEquals(smallerTermCount, countingIndexSearcher.count(queries[0]));
        assertEquals(largerTermCount, countingIndexSearcher.count(queries[1]));
        // No document holds both "small" and "large", so the conjunction matches nothing.
        assertEquals(0, countingIndexSearcher.count(queries[2]));
      }
      {
        // Both terms match and the optimization threshold is hit, with the large term first.
        countInvocations[0] = 0;
        BooleanQuery query =
            new BooleanQuery.Builder()
                .add(new TermQuery(new Term("foo", "large")), BooleanClause.Occur.SHOULD)
                .add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
                .build();

        int count = countingIndexSearcher.count(query);

        assertEquals(largerTermCount + smallerTermCount, count);
        assertEquals(4, countInvocations[0]);

        assertTrue(query.isTwoClausePureDisjunctionWithTerms());
        Query[] queries =
            query.rewriteTwoClauseDisjunctionWithTermsForCount(countingIndexSearcher);
        assertEquals(3, queries.length);
        assertEquals(largerTermCount, countingIndexSearcher.count(queries[0]));
        assertEquals(smallerTermCount, countingIndexSearcher.count(queries[1]));
        // No document holds both "large" and "small", so the conjunction matches nothing.
        assertEquals(0, countingIndexSearcher.count(queries[2]));
      }
      {
        // Both terms match the same documents, so the optimization threshold is not hit.
        countInvocations[0] = 0;
        BooleanQuery query =
            new BooleanQuery.Builder()
                .add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
                .add(new TermQuery(new Term("foo", "also small")), BooleanClause.Occur.SHOULD)
                .build();

        int count = countingIndexSearcher.count(query);

        assertEquals(smallerTermCount, count);
        // One call for the disjunction plus one per term; the conjunction is not counted.
        assertEquals(3, countInvocations[0]);
      }
    }
  }
}
// test BlockMaxMaxscoreScorer
public void testDisjunctionTwoClausesMatchesCountAndScore() throws Exception {
List<String[]> docContent =