mirror of https://github.com/apache/lucene.git
Optimize counts on two clause term disjunctions (#13036)
Calculate count(clause1 OR clause2) as count(clause1) + count(clause2) - count(clause1 AND clause2)
This commit is contained in:
parent
3d47a0d5c2
commit
b537e1da27
|
@ -232,6 +232,8 @@ Optimizations
|
|||
|
||||
* GITHUB#13052: Avoid set.removeAll(list) O(n^2) performance trap in the UpgradeIndexMergePolicy (Dmitry Cherniachenko)
|
||||
|
||||
* GITHUB#13036: Optimize counts on two clause term disjunctions. (Adrien Grand, Johannes Fredén)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
* GITHUB#12866: Prevent extra similarity computation for single-level HNSW graphs. (Kaival Parikh)
|
||||
|
|
|
@ -30,6 +30,7 @@ import java.util.Map;
|
|||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
|
||||
/**
|
||||
|
@ -179,6 +180,38 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
|
|||
return clauses.size() == getClauses(Occur.SHOULD).size() && minimumNumberShouldMatch <= 1;
|
||||
}
|
||||
|
||||
/** Whether this query is a two clause disjunction with two term query clauses. */
|
||||
boolean isTwoClausePureDisjunctionWithTerms() {
|
||||
return clauses.size() == 2
|
||||
&& isPureDisjunction()
|
||||
&& clauses.get(0).getQuery() instanceof TermQuery
|
||||
&& clauses.get(1).getQuery() instanceof TermQuery;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rewrite a single two clause disjunction query with terms to two term queries and a conjunction
|
||||
* query using the inclusion–exclusion principle.
|
||||
*/
|
||||
Query[] rewriteTwoClauseDisjunctionWithTermsForCount(IndexSearcher indexSearcher)
|
||||
throws IOException {
|
||||
BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
|
||||
Query[] queries = new Query[3];
|
||||
for (int i = 0; i < clauses.size(); i++) {
|
||||
TermQuery termQuery = (TermQuery) clauses.get(i).getQuery();
|
||||
// Optimization will count term query several times so use cache to avoid multiple terms
|
||||
// dictionary lookups
|
||||
if (termQuery.getTermStates() == null) {
|
||||
termQuery =
|
||||
new TermQuery(
|
||||
termQuery.getTerm(), TermStates.build(indexSearcher, termQuery.getTerm(), false));
|
||||
}
|
||||
newQuery.add(termQuery, Occur.MUST);
|
||||
queries[i] = termQuery;
|
||||
}
|
||||
queries[2] = newQuery.build();
|
||||
return queries;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an iterator on the clauses in this query. It implements the {@link Iterable} interface
|
||||
* to make it possible to do:
|
||||
|
|
|
@ -420,6 +420,27 @@ public class IndexSearcher {
|
|||
* possible.
|
||||
*/
|
||||
public int count(Query query) throws IOException {
|
||||
// Rewrite query before optimization check
|
||||
query = rewrite(new ConstantScoreQuery(query));
|
||||
if (query instanceof ConstantScoreQuery csq) {
|
||||
query = csq.getQuery();
|
||||
}
|
||||
|
||||
// Check if two clause disjunction optimization applies
|
||||
if (query instanceof BooleanQuery booleanQuery
|
||||
&& this.reader.hasDeletions() == false
|
||||
&& booleanQuery.isTwoClausePureDisjunctionWithTerms()) {
|
||||
Query[] queries = booleanQuery.rewriteTwoClauseDisjunctionWithTermsForCount(this);
|
||||
int countTerm1 = count(queries[0]);
|
||||
int countTerm2 = count(queries[1]);
|
||||
if (countTerm1 == 0 || countTerm2 == 0) {
|
||||
return Math.max(countTerm1, countTerm2);
|
||||
// Only apply optimization if the intersection is significantly smaller than the union
|
||||
} else if ((double) Math.min(countTerm1, countTerm2) / Math.max(countTerm1, countTerm2)
|
||||
< 0.1) {
|
||||
return countTerm1 + countTerm2 - count(queries[2]);
|
||||
}
|
||||
}
|
||||
return search(new ConstantScoreQuery(query), new TotalHitCountCollectorManager());
|
||||
}
|
||||
|
||||
|
|
|
@ -962,6 +962,143 @@ public class TestBooleanQuery extends LuceneTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
public void testTwoClauseTermDisjunctionCountOptimization() throws Exception {
|
||||
int largerTermCount = RandomNumbers.randomIntBetween(random(), 11, 100);
|
||||
int smallerTermCount = RandomNumbers.randomIntBetween(random(), 1, (largerTermCount - 1) / 10);
|
||||
|
||||
List<String[]> docContent = new ArrayList<>(largerTermCount + smallerTermCount);
|
||||
|
||||
for (int i = 0; i < largerTermCount; i++) {
|
||||
docContent.add(new String[] {"large"});
|
||||
}
|
||||
|
||||
for (int i = 0; i < smallerTermCount; i++) {
|
||||
docContent.add(new String[] {"small", "also small"});
|
||||
}
|
||||
|
||||
try (Directory dir = newDirectory()) {
|
||||
try (IndexWriter w =
|
||||
new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) {
|
||||
|
||||
for (String[] values : docContent) {
|
||||
Document doc = new Document();
|
||||
for (String value : values) {
|
||||
doc.add(new StringField("foo", value, Field.Store.NO));
|
||||
}
|
||||
w.addDocument(doc);
|
||||
}
|
||||
w.forceMerge(1);
|
||||
}
|
||||
|
||||
try (IndexReader reader = DirectoryReader.open(dir)) {
|
||||
final int[] countInvocations = new int[] {0};
|
||||
IndexSearcher countingIndexSearcher =
|
||||
new IndexSearcher(reader) {
|
||||
@Override
|
||||
public int count(Query query) throws IOException {
|
||||
countInvocations[0]++;
|
||||
return super.count(query);
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
// Test no matches in either term
|
||||
countInvocations[0] = 0;
|
||||
BooleanQuery query =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("foo", "also no match")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
assertEquals(0, countingIndexSearcher.count(query));
|
||||
assertEquals(3, countInvocations[0]);
|
||||
}
|
||||
{
|
||||
// Test match no match in first term
|
||||
countInvocations[0] = 0;
|
||||
BooleanQuery query =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
assertEquals(smallerTermCount, countingIndexSearcher.count(query));
|
||||
assertEquals(3, countInvocations[0]);
|
||||
}
|
||||
{
|
||||
// Test match no match in second term
|
||||
countInvocations[0] = 0;
|
||||
BooleanQuery query =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("foo", "no match")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
assertEquals(smallerTermCount, countingIndexSearcher.count(query));
|
||||
assertEquals(3, countInvocations[0]);
|
||||
}
|
||||
{
|
||||
// Test match in both terms that hits optimization threshold with small term first
|
||||
countInvocations[0] = 0;
|
||||
|
||||
BooleanQuery query =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("foo", "large")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
int count = countingIndexSearcher.count(query);
|
||||
|
||||
assertEquals(largerTermCount + smallerTermCount, count);
|
||||
assertEquals(4, countInvocations[0]);
|
||||
|
||||
assertTrue(query.isTwoClausePureDisjunctionWithTerms());
|
||||
Query[] queries =
|
||||
query.rewriteTwoClauseDisjunctionWithTermsForCount(countingIndexSearcher);
|
||||
assertEquals(queries.length, 3);
|
||||
assertEquals(smallerTermCount, countingIndexSearcher.count(queries[0]));
|
||||
assertEquals(largerTermCount, countingIndexSearcher.count(queries[1]));
|
||||
}
|
||||
{
|
||||
// Test match in both terms that hits optimization threshold with large term first
|
||||
countInvocations[0] = 0;
|
||||
|
||||
BooleanQuery query =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "large")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
int count = countingIndexSearcher.count(query);
|
||||
|
||||
assertEquals(largerTermCount + smallerTermCount, count);
|
||||
assertEquals(4, countInvocations[0]);
|
||||
|
||||
assertTrue(query.isTwoClausePureDisjunctionWithTerms());
|
||||
Query[] queries =
|
||||
query.rewriteTwoClauseDisjunctionWithTermsForCount(countingIndexSearcher);
|
||||
assertEquals(queries.length, 3);
|
||||
assertEquals(largerTermCount, countingIndexSearcher.count(queries[0]));
|
||||
assertEquals(smallerTermCount, countingIndexSearcher.count(queries[1]));
|
||||
}
|
||||
{
|
||||
// Test match in both terms that doesn't hit optimization threshold
|
||||
countInvocations[0] = 0;
|
||||
BooleanQuery query =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("foo", "small")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("foo", "also small")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
||||
int count = countingIndexSearcher.count(query);
|
||||
|
||||
assertEquals(smallerTermCount, count);
|
||||
assertEquals(3, countInvocations[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// test BlockMaxMaxscoreScorer
|
||||
public void testDisjunctionTwoClausesMatchesCountAndScore() throws Exception {
|
||||
List<String[]> docContent =
|
||||
|
|
Loading…
Reference in New Issue