TermInSetQuery optimization when all docs in a field match a term (#11828)

2025-02-07 02:28:49 +00:00 · 2022-09-29 06:59:59 -07:00 · 2022-09-29 06:59:59 -07:00 · 44b4602776
commit 44b4602776
parent 367cd2ea95
3 changed files with 26 additions and 3 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -132,6 +132,10 @@ Optimizations
 * GITHUB#11803: DrillSidewaysScorer has improved to leverage "advance" instead of "next" where
  possible, and splits out first and second phase checks to delay match confirmation. (Greg Miller)

+* GITHUB#11828: Tweak TermInSetQuery "dense" optimization to only require all docs with a value
+  in a given field to match a term (rather than all docs in a segment). This is consistent with
+  MultiTermQueryConstantScoreWrapper. (Greg Miller)
+
 Other
 ---------------------
 * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@ -266,6 +266,7 @@ public class TermInSetQuery extends Query implements Accountable {
        if (terms == null) {
          return null;
        }
+        final int fieldDocCount = terms.getDocCount();
        TermsEnum termsEnum = terms.iterator();
        PostingsEnum docs = null;
        TermIterator iterator = termData.iterator();
@ -281,8 +282,18 @@ public class TermInSetQuery extends Query implements Accountable {
        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
          assert field.equals(iterator.field());
          if (termsEnum.seekExact(term)) {
-            if (reader.maxDoc() == termsEnum.docFreq()) {
-              return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
+            // If a term contains all docs with a value for the specified field (likely rare),
+            // we can discard the other terms and just use the dense term's postings:
+            int docFreq = termsEnum.docFreq();
+            if (fieldDocCount == docFreq) {
+              TermStates termStates = new TermStates(searcher.getTopReaderContext());
+              termStates.register(
+                  termsEnum.termState(), context.ord, docFreq, termsEnum.totalTermFreq());
+              Query q =
+                  new ConstantScoreQuery(
+                      new TermQuery(new Term(field, termsEnum.term()), termStates));
+              Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
+              return new WeightOrDocIdSet(weight);
            }

            if (matchingTerms == null) {
@ -304,6 +315,7 @@ public class TermInSetQuery extends Query implements Accountable {
            }
          }
        }
+
        if (matchingTerms != null) {
          assert builder == null;
          BooleanQuery.Builder bq = new BooleanQuery.Builder();
--- a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java
@ -51,7 +51,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;

 public class TestTermInSetQuery extends LuceneTestCase {

-  public void testAllDocsTerm() throws IOException {
+  public void testAllDocsInFieldTerm() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    String field = "f";
@ -69,6 +69,7 @@ public class TestTermInSetQuery extends LuceneTestCase {
      otherTerms[idx++] = term;
    }

+    // Every doc with a value for `field` will contain `denseTerm`:
    int numDocs = 10 * otherTerms.length;
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
@ -78,6 +79,13 @ public class TestTermInSetQuery extends LuceneTestCase {
       iw.addDocument(doc);
     }

+    // Make sure there are some docs in the index that don't contain a value for the field at all:
+    for (int i = 0; i < 100; i++) {
+      Document doc = new Document();
+      doc.add(new StringField("foo", "bar", Store.NO));
+      iw.addDocument(doc);
+    }
+
    IndexReader reader = iw.getReader();
    IndexSearcher searcher = newSearcher(reader);
    iw.close();