mirror of https://github.com/apache/lucene.git
Enable executing using NFA in RegexpQuery (#12767)
This commit is contained in:
parent
e1af4182d6
commit
904a994f66
|
@ -72,6 +72,8 @@ New Features
|
|||
|
||||
* LUCENE-10010 Introduce NFARunAutomaton to run NFA directly. (Patrick Zhai)
|
||||
|
||||
* GITHUB-12767: Add a flag to enable executing using NFA in RegexpQuery. (Patrick Zhai)
|
||||
|
||||
* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
|
||||
analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonProvider;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
@ -139,15 +140,60 @@ public class RegexpQuery extends AutomatonQuery {
|
|||
AutomatonProvider provider,
|
||||
int determinizeWorkLimit,
|
||||
RewriteMethod rewriteMethod) {
|
||||
this(term, syntax_flags, match_flags, provider, determinizeWorkLimit, rewriteMethod, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a query for terms matching <code>term</code>.
|
||||
*
|
||||
* @param term regular expression.
|
||||
* @param syntax_flags optional RegExp features from {@link RegExp}
|
||||
* @param match_flags boolean 'or' of match behavior options such as case insensitivity
|
||||
* @param provider custom AutomatonProvider for named automata
|
||||
* @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
|
||||
* regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion.
|
||||
* Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
|
||||
* otherwise know what to specify.
|
||||
* @param rewriteMethod the rewrite method to use to build the final query
|
||||
* @param doDeterminization whether to determinize the automaton, forcing the query to use a DFA as
|
||||
* runAutomaton. If false, the query will not try to determinize the generated automaton from
|
||||
* regexp such that it might or might not be a DFA. In case it is an NFA, the query will
|
||||
* eventually use {@link org.apache.lucene.util.automaton.NFARunAutomaton} to execute. Notice
|
||||
* that {@link org.apache.lucene.util.automaton.NFARunAutomaton} is not thread-safe, so better
|
||||
* to avoid rewrite methods like {@link #CONSTANT_SCORE_BLENDED_REWRITE} when the searcher is
|
||||
* configured with an executor service
|
||||
*/
|
||||
public RegexpQuery(
|
||||
Term term,
|
||||
int syntax_flags,
|
||||
int match_flags,
|
||||
AutomatonProvider provider,
|
||||
int determinizeWorkLimit,
|
||||
RewriteMethod rewriteMethod,
|
||||
boolean doDeterminization) {
|
||||
super(
|
||||
term,
|
||||
Operations.determinize(
|
||||
new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(provider),
|
||||
determinizeWorkLimit),
|
||||
toAutomaton(
|
||||
new RegExp(term.text(), syntax_flags, match_flags),
|
||||
determinizeWorkLimit,
|
||||
provider,
|
||||
doDeterminization),
|
||||
false,
|
||||
rewriteMethod);
|
||||
}
|
||||
|
||||
private static Automaton toAutomaton(
|
||||
RegExp regexp,
|
||||
int determinizeWorkLimit,
|
||||
AutomatonProvider provider,
|
||||
boolean doDeterminization) {
|
||||
if (doDeterminization) {
|
||||
return Operations.determinize(regexp.toAutomaton(provider), determinizeWorkLimit);
|
||||
} else {
|
||||
return regexp.toAutomaton(provider);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the regexp of this query wrapped in a Term. */
|
||||
public Term getRegexp() {
|
||||
return term;
|
||||
|
|
|
@ -25,9 +25,11 @@ import org.apache.lucene.util.hppc.BitMixer;
|
|||
|
||||
/**
|
||||
* A RunAutomaton that does not require DFA. It will lazily determinize on-demand, memorizing the
|
||||
* generated DFA states that has been explored
|
||||
* generated DFA states that have been explored. Note: the current implementation is NOT thread-safe.
|
||||
*
|
||||
* <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public class NFARunAutomaton implements ByteRunnable, TransitionAccessor {
|
||||
|
||||
|
|
|
@ -80,7 +80,10 @@ public class TestRegexpQuery extends LuceneTestCase {
|
|||
newTerm(regex),
|
||||
RegExp.ALL,
|
||||
RegExp.ASCII_CASE_INSENSITIVE,
|
||||
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
|
||||
RegexpQuery.DEFAULT_PROVIDER,
|
||||
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT,
|
||||
MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE,
|
||||
random().nextBoolean());
|
||||
return searcher.count(query);
|
||||
}
|
||||
|
||||
|
|
|
@ -52,6 +52,7 @@ import org.apache.lucene.util.automaton.RegExp;
|
|||
public class TestRegexpRandom2 extends LuceneTestCase {
|
||||
protected IndexSearcher searcher1;
|
||||
protected IndexSearcher searcher2;
|
||||
protected IndexSearcher searcher3;
|
||||
private IndexReader reader;
|
||||
private Directory dir;
|
||||
protected String fieldName;
|
||||
|
@ -95,6 +96,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
|
|||
reader = writer.getReader();
|
||||
searcher1 = newSearcher(reader);
|
||||
searcher2 = newSearcher(reader);
|
||||
searcher3 = newSearcher(reader);
|
||||
writer.close();
|
||||
}
|
||||
|
||||
|
@ -172,11 +174,22 @@ public class TestRegexpRandom2 extends LuceneTestCase {
|
|||
/** check that the # of hits is the same as from a very simple regexpquery implementation. */
|
||||
protected void assertSame(String regexp) throws IOException {
|
||||
RegexpQuery smart = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
|
||||
RegexpQuery nfaQuery =
|
||||
new RegexpQuery(
|
||||
new Term(fieldName, regexp),
|
||||
RegExp.NONE,
|
||||
0,
|
||||
RegexpQuery.DEFAULT_PROVIDER,
|
||||
0,
|
||||
MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE,
|
||||
false);
|
||||
DumbRegexpQuery dumb = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
|
||||
|
||||
TopDocs smartDocs = searcher1.search(smart, 25);
|
||||
TopDocs dumbDocs = searcher2.search(dumb, 25);
|
||||
TopDocs nfaDocs = searcher3.search(nfaQuery, 25);
|
||||
|
||||
CheckHits.checkEqual(smart, smartDocs.scoreDocs, dumbDocs.scoreDocs);
|
||||
CheckHits.checkEqual(nfaQuery, nfaDocs.scoreDocs, dumbDocs.scoreDocs);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue