TermInSetQuery optimization when all docs in a field match a term (#11828)

This commit is contained in:
Greg Miller 2022-09-29 06:59:59 -07:00 committed by GitHub
parent 367cd2ea95
commit 44b4602776
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 26 additions and 3 deletions

View File

@ -132,6 +132,10 @@ Optimizations
* GITHUB#11803: DrillSidewaysScorer has improved to leverage "advance" instead of "next" where
possible, and splits out first and second phase checks to delay match confirmation. (Greg Miller)
* GITHUB#11828: Tweak TermInSetQuery "dense" optimization to only require all docs with a value in a
given field to match a term (rather than all docs in a segment). This is consistent with
MultiTermQueryConstantScoreWrapper. (Greg Miller)
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)

View File

@ -266,6 +266,7 @@ public class TermInSetQuery extends Query implements Accountable {
if (terms == null) {
return null;
}
final int fieldDocCount = terms.getDocCount();
TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
TermIterator iterator = termData.iterator();
@ -281,8 +282,18 @@ public class TermInSetQuery extends Query implements Accountable {
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
assert field.equals(iterator.field());
if (termsEnum.seekExact(term)) {
if (reader.maxDoc() == termsEnum.docFreq()) {
return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
// If a term contains all docs with a value for the specified field (likely rare),
// we can discard the other terms and just use the dense term's postings:
int docFreq = termsEnum.docFreq();
if (fieldDocCount == docFreq) {
TermStates termStates = new TermStates(searcher.getTopReaderContext());
termStates.register(
termsEnum.termState(), context.ord, docFreq, termsEnum.totalTermFreq());
Query q =
new ConstantScoreQuery(
new TermQuery(new Term(field, termsEnum.term()), termStates));
Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
return new WeightOrDocIdSet(weight);
}
if (matchingTerms == null) {
@ -304,6 +315,7 @@ public class TermInSetQuery extends Query implements Accountable {
}
}
}
if (matchingTerms != null) {
assert builder == null;
BooleanQuery.Builder bq = new BooleanQuery.Builder();

View File

@ -51,7 +51,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
public class TestTermInSetQuery extends LuceneTestCase {
public void testAllDocsTerm() throws IOException {
public void testAllDocsInFieldTerm() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
String field = "f";
@ -69,6 +69,7 @@ public class TestTermInSetQuery extends LuceneTestCase {
otherTerms[idx++] = term;
}
// Every doc with a value for `field` will contain `denseTerm`:
int numDocs = 10 * otherTerms.length;
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
@ -78,6 +79,12 @@ public class TestTermInSetQuery extends LuceneTestCase {
iw.addDocument(doc);
}
// Make sure there are some docs in the index that don't contain a value for the field at all:
for (int i = 0; i < 100; i++) {
  // These docs only carry the unrelated field "foo", so they have no value for `field`.
  Document doc = new Document();
  doc.add(new StringField("foo", "bar", Store.NO));
  // The doc must actually be indexed; otherwise every doc in the index has a value for
  // `field` and the "dense within the field, but not within the segment" case is not covered.
  iw.addDocument(doc);
}
IndexReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();