Enable executing using NFA in RegexpQuery (#12767)

This commit is contained in:
Patrick Zhai 2023-11-09 23:12:42 -08:00 committed by GitHub
parent e1af4182d6
commit 904a994f66
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 71 additions and 5 deletions

View File

@ -72,6 +72,8 @@ New Features
* LUCENE-10010 Introduce NFARunAutomaton to run NFA directly. (Patrick Zhai)
* GITHUB-12767: Add a flag to enable executing using NFA in RegexpQuery. (Patrick Zhai)
* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonProvider;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
@ -139,15 +140,60 @@ public class RegexpQuery extends AutomatonQuery {
AutomatonProvider provider,
int determinizeWorkLimit,
RewriteMethod rewriteMethod) {
this(term, syntax_flags, match_flags, provider, determinizeWorkLimit, rewriteMethod, true);
}
/**
* Constructs a query for terms matching <code>term</code>.
*
* @param term regular expression.
* @param syntax_flags optional RegExp features from {@link RegExp}
* @param match_flags boolean 'or' of match behavior options such as case insensitivity
* @param provider custom AutomatonProvider for named automata
* @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
* regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion.
* Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
* otherwise know what to specify.
* @param rewriteMethod the rewrite method to use to build the final query
* @param doDeterminization whether to determinize the automaton so that the query is forced to
* use a DFA as its runAutomaton. If false, the query will not try to determinize the automaton
* generated from the regexp, so it might or might not be a DFA. If it is an NFA, the query
* will eventually use {@link org.apache.lucene.util.automaton.NFARunAutomaton} to execute.
* Note that {@link org.apache.lucene.util.automaton.NFARunAutomaton} is not thread-safe, so
* it is better to avoid rewrite methods like {@link #CONSTANT_SCORE_BLENDED_REWRITE} when the
* searcher is configured with an executor service.
*/
public RegexpQuery(
Term term,
int syntax_flags,
int match_flags,
AutomatonProvider provider,
int determinizeWorkLimit,
RewriteMethod rewriteMethod,
boolean doDeterminization) {
super(
term,
Operations.determinize(
new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(provider),
determinizeWorkLimit),
toAutomaton(
new RegExp(term.text(), syntax_flags, match_flags),
determinizeWorkLimit,
provider,
doDeterminization),
false,
rewriteMethod);
}
/**
 * Compiles the given regular expression into an automaton, determinizing the result only when
 * asked to.
 *
 * @param regexp parsed regular expression to compile
 * @param determinizeWorkLimit work limit handed to {@link Operations#determinize}; only
 *     consulted when {@code doDeterminization} is true
 * @param provider provider passed to {@link RegExp#toAutomaton} when compiling the expression
 * @param doDeterminization if true the compiled automaton is determinized before being returned,
 *     otherwise it is returned exactly as compiled
 * @return the automaton built from {@code regexp}
 */
private static Automaton toAutomaton(
    RegExp regexp,
    int determinizeWorkLimit,
    AutomatonProvider provider,
    boolean doDeterminization) {
  final Automaton compiled = regexp.toAutomaton(provider);
  if (!doDeterminization) {
    // Caller opted out of determinization: hand back the automaton as compiled.
    return compiled;
  }
  return Operations.determinize(compiled, determinizeWorkLimit);
}
/** Returns the regexp of this query wrapped in a Term. */
public Term getRegexp() {
return term;

View File

@ -25,9 +25,11 @@ import org.apache.lucene.util.hppc.BitMixer;
/**
* A RunAutomaton that does not require DFA. It will lazily determinize on-demand, memorizing the
* generated DFA states that has been explored
* generated DFA states that have been explored. Note: the current implementation is NOT thread-safe.
*
* <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
*
* @lucene.internal
*/
public class NFARunAutomaton implements ByteRunnable, TransitionAccessor {

View File

@ -80,7 +80,10 @@ public class TestRegexpQuery extends LuceneTestCase {
newTerm(regex),
RegExp.ALL,
RegExp.ASCII_CASE_INSENSITIVE,
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
RegexpQuery.DEFAULT_PROVIDER,
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT,
MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE,
random().nextBoolean());
return searcher.count(query);
}

View File

@ -52,6 +52,7 @@ import org.apache.lucene.util.automaton.RegExp;
public class TestRegexpRandom2 extends LuceneTestCase {
protected IndexSearcher searcher1;
protected IndexSearcher searcher2;
protected IndexSearcher searcher3;
private IndexReader reader;
private Directory dir;
protected String fieldName;
@ -95,6 +96,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
reader = writer.getReader();
searcher1 = newSearcher(reader);
searcher2 = newSearcher(reader);
searcher3 = newSearcher(reader);
writer.close();
}
@ -172,11 +174,22 @@ public class TestRegexpRandom2 extends LuceneTestCase {
/**
 * Checks that the default RegexpQuery and the non-determinized RegexpQuery each return the same
 * hits as a very simple regexpquery implementation.
 */
protected void assertSame(String regexp) throws IOException {
  final Term pattern = new Term(fieldName, regexp);

  // Query built through the short constructor.
  RegexpQuery dfaQuery = new RegexpQuery(pattern, RegExp.NONE);
  // Same pattern with determinization switched off (last argument is doDeterminization).
  RegexpQuery nfaQuery =
      new RegexpQuery(
          pattern,
          RegExp.NONE,
          0,
          RegexpQuery.DEFAULT_PROVIDER,
          0,
          MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE,
          false);
  // Reference implementation both queries are compared against.
  DumbRegexpQuery referenceQuery = new DumbRegexpQuery(pattern, RegExp.NONE);

  TopDocs dfaHits = searcher1.search(dfaQuery, 25);
  TopDocs referenceHits = searcher2.search(referenceQuery, 25);
  TopDocs nfaHits = searcher3.search(nfaQuery, 25);

  CheckHits.checkEqual(dfaQuery, dfaHits.scoreDocs, referenceHits.scoreDocs);
  CheckHits.checkEqual(nfaQuery, nfaHits.scoreDocs, referenceHits.scoreDocs);
}
}