Enable executing using NFA in RegexpQuery (#12767)

2023-11-09 23:12:42 -08:00 · 2023-11-09 23:12:42 -08:00 · 904a994f66
parent e1af4182d6
commit 904a994f66
5 changed files with 71 additions and 5 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -72,6 +72,8 @@ New Features

 * LUCENE-10010 Introduce NFARunAutomaton to run NFA directly. (Patrick Zhai)

+* GITHUB-12767: Add a flag to enable executing using NFA in RegexpQuery. (Patrick Zhai)
+
 * LUCENE-10626 Hunspell: add tools to aid dictionary editing:
  analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)

--- a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
@ -17,6 +17,7 @@
 package org.apache.lucene.search;

 import org.apache.lucene.index.Term;
+import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.AutomatonProvider;
 import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.RegExp;
@ -139,15 +140,60 @@ public class RegexpQuery extends AutomatonQuery {
      AutomatonProvider provider,
      int determinizeWorkLimit,
      RewriteMethod rewriteMethod) {
+    this(term, syntax_flags, match_flags, provider, determinizeWorkLimit, rewriteMethod, true);
+  }
+
+  /**
+   * Constructs a query for terms matching <code>term</code>.
+   *
+   * @param term regular expression.
+   * @param syntax_flags optional RegExp features from {@link RegExp}
+   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+   * @param provider custom AutomatonProvider for named automata
+   * @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
+   *     regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion.
+   *     Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
+   *     otherwise know what to specify.
+   * @param rewriteMethod the rewrite method to use to build the final query
+   * @param doDeterminization whether to determinize the automaton so that the query runs on a
+   *     DFA. If false, the query will not try to determinize the automaton generated from the
+   *     regexp, so it might or might not be a DFA. If it is an NFA, the query will eventually
+   *     use {@link org.apache.lucene.util.automaton.NFARunAutomaton} to execute. Note that
+   *     {@link org.apache.lucene.util.automaton.NFARunAutomaton} is not thread-safe, so it is
+   *     better to avoid rewrite methods like {@link #CONSTANT_SCORE_BLENDED_REWRITE} when the
+   *     searcher is configured with an executor service.
+   */
+  public RegexpQuery(
+      Term term,
+      int syntax_flags,
+      int match_flags,
+      AutomatonProvider provider,
+      int determinizeWorkLimit,
+      RewriteMethod rewriteMethod,
+      boolean doDeterminization) {
    super(
        term,
-        Operations.determinize(
-            new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(provider),
-            determinizeWorkLimit),
+        toAutomaton(
+            new RegExp(term.text(), syntax_flags, match_flags),
+            determinizeWorkLimit,
+            provider,
+            doDeterminization),
        false,
        rewriteMethod);
  }

+  private static Automaton toAutomaton(
+      RegExp regexp,
+      int determinizeWorkLimit,
+      AutomatonProvider provider,
+      boolean doDeterminization) {
+    if (doDeterminization) {
+      return Operations.determinize(regexp.toAutomaton(provider), determinizeWorkLimit);
+    } else {
+      return regexp.toAutomaton(provider);
+    }
+  }
+
  /** Returns the regexp of this query wrapped in a Term. */
  public Term getRegexp() {
    return term;
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/NFARunAutomaton.java
@ -25,9 +25,11 @@ import org.apache.lucene.util.hppc.BitMixer;

 /**
 * A RunAutomaton that does not require DFA. It will lazily determinize on-demand, memorizing the
- * generated DFA states that has been explored
+ * generated DFA states that have been explored. Note: the current implementation is NOT thread-safe
 *
 * <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
+ *
+ * @lucene.internal
 */
 public class NFARunAutomaton implements ByteRunnable, TransitionAccessor {

--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@ -80,7 +80,10 @@ public class TestRegexpQuery extends LuceneTestCase {
            newTerm(regex),
            RegExp.ALL,
            RegExp.ASCII_CASE_INSENSITIVE,
-            Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+            RegexpQuery.DEFAULT_PROVIDER,
+            Operations.DEFAULT_DETERMINIZE_WORK_LIMIT,
+            MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE,
+            random().nextBoolean());
    return searcher.count(query);
  }

--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java
@ -52,6 +52,7 @@ import org.apache.lucene.util.automaton.RegExp;
 public class TestRegexpRandom2 extends LuceneTestCase {
  protected IndexSearcher searcher1;
  protected IndexSearcher searcher2;
+  protected IndexSearcher searcher3;
  private IndexReader reader;
  private Directory dir;
  protected String fieldName;
@ -95,6 +96,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
    reader = writer.getReader();
    searcher1 = newSearcher(reader);
    searcher2 = newSearcher(reader);
+    searcher3 = newSearcher(reader);
    writer.close();
  }

@ -172,11 +174,22 @@ public class TestRegexpRandom2 extends LuceneTestCase {
  /** check that the # of hits is the same as from a very simple regexpquery implementation. */
  protected void assertSame(String regexp) throws IOException {
    RegexpQuery smart = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
+    RegexpQuery nfaQuery =
+        new RegexpQuery(
+            new Term(fieldName, regexp),
+            RegExp.NONE,
+            0,
+            RegexpQuery.DEFAULT_PROVIDER,
+            0,
+            MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE,
+            false);
    DumbRegexpQuery dumb = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE);

    TopDocs smartDocs = searcher1.search(smart, 25);
    TopDocs dumbDocs = searcher2.search(dumb, 25);
+    TopDocs nfaDocs = searcher3.search(nfaQuery, 25);

    CheckHits.checkEqual(smart, smartDocs.scoreDocs, dumbDocs.scoreDocs);
+    CheckHits.checkEqual(nfaQuery, nfaDocs.scoreDocs, dumbDocs.scoreDocs);
  }
 }