LUCENE-9349: TermInSetQuery should use consumeMatchingTerms in visit() (#1465)

TermInSetQuery currently iterates through all its prefix-encoded terms in order to build an array to pass back to its visitor when visit() is called. This seems like a waste, particularly when the visitor is not actually consuming the terms (for example, when doing a clause-count check before executing a search). This commit changes TermInSetQuery to use consumeTermsMatching(), and also changes the signature of this method so that it takes a BytesRunAutomaton supplier to allow for lazy instantiation. In addition, IndexSearcher's clause count check wasn't counting leaves that called consumeTermsMatching().
2020-04-29 10:19:05 +01:00 · 2020-04-29 10:19:05 +01:00 · 267d70b66b
parent 59a8e83520
commit 267d70b66b
7 changed files with 85 additions and 12 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -142,6 +142,11 @@ API Changes
  interface that offers the basic methods to acquire pending merges, run the merge and do accounting 
  around it. (Simon Willnauer)

+* LUCENE-9349: QueryVisitor.consumeTermsMatching() now takes a
+  Supplier<ByteRunAutomaton> to enable queries that build large automata to
+  provide them lazily.  TermsInSetQuery switches to using this method
+  to report matching terms. (Alan Woodward)
+
 New Features
 ---------------------
 (No changes)
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@ -31,6 +31,7 @@ import java.util.concurrent.Executor;
 import java.util.concurrent.Future;
 import java.util.concurrent.FutureTask;
 import java.util.concurrent.ThreadPoolExecutor;
+import java.util.function.Supplier;

 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
@ -47,6 +48,7 @@ import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.NIOFSDirectory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;

 /** Implements search over a single IndexReader.
 *
@ -790,6 +792,14 @@ public class IndexSearcher {
        }
        ++numClauses;
      }
+
+      @Override
+      public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
+        if (numClauses > maxClauseCount) {
+          throw new TooManyClauses();
+        }
+        ++numClauses;
+      }
    };
  }

--- a/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java
+++ b/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java
@ -19,6 +19,7 @@ package org.apache.lucene.search;

 import java.util.Arrays;
 import java.util.Set;
+import java.util.function.Supplier;

 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.automaton.ByteRunAutomaton;
@ -43,11 +44,11 @@ public abstract class QueryVisitor {
   *
   * @param query     the leaf query
   * @param field     the field queried against
-   * @param automaton an automaton defining which terms match
+   * @param automaton a supplier for an automaton defining which terms match
   *
   * @lucene.experimental
   */
-  public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
+  public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
    visitLeaf(query); // default impl for backward compatibility
  }

--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@ -43,6 +43,11 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.DocIdSetBuilder;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.Operations;

 /**
 * Specialization for a disjunction over many terms that behaves like a
@ -126,13 +131,22 @@ public class TermInSetQuery extends Query implements Accountable {
    if (visitor.acceptField(field) == false) {
      return;
    }
-    QueryVisitor v = visitor.getSubVisitor(Occur.SHOULD, this);
-    List<Term> terms = new ArrayList<>();
-    TermIterator iterator = termData.iterator();
-    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
-      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
+    if (termData.size() == 1) {
+      visitor.consumeTerms(this, new Term(field, termData.iterator().next()));
    }
-    v.consumeTerms(this, terms.toArray(new Term[0]));
+    if (termData.size() > 1) {
+      visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton);
+    }
+  }
+
+  private ByteRunAutomaton asByteRunAutomaton() {
+    TermIterator iterator = termData.iterator();
+    List<Automaton> automata = new ArrayList<>();
+    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
+      automata.add(Automata.makeBinary(term));
+    }
+    return new CompiledAutomaton(Operations.union(automata)).runAutomaton;
+
  }

  @Override
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
@ -354,12 +354,12 @@ public class CompiledAutomaton implements Accountable {
    if (visitor.acceptField(field)) {
      switch (type) {
        case NORMAL:
-          visitor.consumeTermsMatching(parent, field, runAutomaton);
+          visitor.consumeTermsMatching(parent, field, () -> runAutomaton);
          break;
        case NONE:
          break;
        case ALL:
-          visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString()));
+          visitor.consumeTermsMatching(parent, field, () -> new ByteRunAutomaton(Automata.makeAnyString()));
          break;
        case SINGLE:
          visitor.consumeTerms(parent, new Term(field, term));
--- a/lucene/core/src/test/org/apache/lucene/search/TermInSetQueryTest.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TermInSetQueryTest.java
@ -23,6 +23,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Supplier;

 import com.carrotsearch.randomizedtesting.generators.RandomStrings;
 import org.apache.lucene.document.Document;
@ -44,6 +45,7 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.RamUsageTester;
 import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;

 public class TermInSetQueryTest extends LuceneTestCase {

@ -297,4 +299,44 @@ public class TermInSetQueryTest extends LuceneTestCase {
    // cached after two uses
    assertTrue(policy.shouldCache(query));
  }
+
+  public void testVisitor() {
+    // singleton reports back to consumeTerms()
+    TermInSetQuery singleton = new TermInSetQuery("field", new BytesRef("term1"));
+    singleton.visit(new QueryVisitor() {
+      @Override
+      public void consumeTerms(Query query, Term... terms) {
+        assertEquals(1, terms.length);
+        assertEquals(new Term("field", new BytesRef("term1")), terms[0]);
+      }
+
+      @Override
+      public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
+        fail("Singleton TermInSetQuery should not try to build ByteRunAutomaton");
+      }
+    });
+
+    // multiple values built into automaton
+    List<BytesRef> terms = new ArrayList<>();
+    for (int i = 0; i < 100; i++) {
+      terms.add(new BytesRef("term" + i));
+    }
+    TermInSetQuery t = new TermInSetQuery("field", terms);
+    t.visit(new QueryVisitor() {
+      @Override
+      public void consumeTerms(Query query, Term... terms) {
+        fail("TermInSetQuery with multiple terms should build automaton");
+      }
+
+      @Override
+      public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
+        ByteRunAutomaton a = automaton.get();
+        BytesRef test = new BytesRef("nonmatching");
+        assertFalse(a.run(test.bytes, test.offset, test.length));
+        for (BytesRef term : terms) {
+          assertTrue(a.run(term.bytes, term.offset, term.length));
+        }
+      }
+    });
+  }
 }
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
@ -19,6 +19,7 @@ package org.apache.lucene.search.uhighlight;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.function.Predicate;
+import java.util.function.Supplier;

 import org.apache.lucene.search.AutomatonQuery;
 import org.apache.lucene.search.BooleanClause;
@ -80,8 +81,8 @@ final class MultiTermHighlighting {
    }

    @Override
-    public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
-      runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
+    public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
+      runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton.get()));
    }

  }