mirror of https://github.com/apache/lucene.git
Enable executing using NFA in RegexpQuery (#12767)
This commit is contained in:
parent
e1af4182d6
commit
904a994f66
|
@ -72,6 +72,8 @@ New Features
|
||||||
|
|
||||||
* LUCENE-10010 Introduce NFARunAutomaton to run NFA directly. (Patrick Zhai)
|
* LUCENE-10010 Introduce NFARunAutomaton to run NFA directly. (Patrick Zhai)
|
||||||
|
|
||||||
|
* GITHUB-12767: Add a flag to enable executing using NFA in RegexpQuery. (Patrick Zhai)
|
||||||
|
|
||||||
* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
|
* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
|
||||||
analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)
|
analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package org.apache.lucene.search;
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
import org.apache.lucene.util.automaton.AutomatonProvider;
|
import org.apache.lucene.util.automaton.AutomatonProvider;
|
||||||
import org.apache.lucene.util.automaton.Operations;
|
import org.apache.lucene.util.automaton.Operations;
|
||||||
import org.apache.lucene.util.automaton.RegExp;
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
|
@ -139,15 +140,60 @@ public class RegexpQuery extends AutomatonQuery {
|
||||||
AutomatonProvider provider,
|
AutomatonProvider provider,
|
||||||
int determinizeWorkLimit,
|
int determinizeWorkLimit,
|
||||||
RewriteMethod rewriteMethod) {
|
RewriteMethod rewriteMethod) {
|
||||||
|
this(term, syntax_flags, match_flags, provider, determinizeWorkLimit, rewriteMethod, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a query for terms matching <code>term</code>.
|
||||||
|
*
|
||||||
|
* @param term regular expression.
|
||||||
|
* @param syntax_flags optional RegExp features from {@link RegExp}
|
||||||
|
* @param match_flags boolean 'or' of match behavior options such as case insensitivity
|
||||||
|
* @param provider custom AutomatonProvider for named automata
|
||||||
|
* @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
|
||||||
|
* regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion.
|
||||||
|
* Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
|
||||||
|
* otherwise know what to specify.
|
||||||
|
* @param rewriteMethod the rewrite method to use to build the final query
|
||||||
|
* @param doDeterminization whether to determinize so that the query is forced to use a DFA as
|
||||||
|
* runAutomaton. If false, the query will not try to determinize the generated automaton from
|
||||||
|
* regexp such that it might or might not be a DFA. In case it is an NFA, the query will
|
||||||
|
* eventually use {@link org.apache.lucene.util.automaton.NFARunAutomaton} to execute. Notice
|
||||||
|
* that {@link org.apache.lucene.util.automaton.NFARunAutomaton} is not thread-safe, so it is better
|
||||||
|
* to avoid rewrite methods like {@link #CONSTANT_SCORE_BLENDED_REWRITE} when the searcher is
|
||||||
|
* configured with an executor service
|
||||||
|
*/
|
||||||
|
public RegexpQuery(
|
||||||
|
Term term,
|
||||||
|
int syntax_flags,
|
||||||
|
int match_flags,
|
||||||
|
AutomatonProvider provider,
|
||||||
|
int determinizeWorkLimit,
|
||||||
|
RewriteMethod rewriteMethod,
|
||||||
|
boolean doDeterminization) {
|
||||||
super(
|
super(
|
||||||
term,
|
term,
|
||||||
Operations.determinize(
|
toAutomaton(
|
||||||
new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(provider),
|
new RegExp(term.text(), syntax_flags, match_flags),
|
||||||
determinizeWorkLimit),
|
determinizeWorkLimit,
|
||||||
|
provider,
|
||||||
|
doDeterminization),
|
||||||
false,
|
false,
|
||||||
rewriteMethod);
|
rewriteMethod);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static Automaton toAutomaton(
|
||||||
|
RegExp regexp,
|
||||||
|
int determinizeWorkLimit,
|
||||||
|
AutomatonProvider provider,
|
||||||
|
boolean doDeterminization) {
|
||||||
|
if (doDeterminization) {
|
||||||
|
return Operations.determinize(regexp.toAutomaton(provider), determinizeWorkLimit);
|
||||||
|
} else {
|
||||||
|
return regexp.toAutomaton(provider);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns the regexp of this query wrapped in a Term. */
|
/** Returns the regexp of this query wrapped in a Term. */
|
||||||
public Term getRegexp() {
|
public Term getRegexp() {
|
||||||
return term;
|
return term;
|
||||||
|
|
|
@ -25,9 +25,11 @@ import org.apache.lucene.util.hppc.BitMixer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A RunAutomaton that does not require DFA. It will lazily determinize on-demand, memorizing the
|
* A RunAutomaton that does not require DFA. It will lazily determinize on-demand, memorizing the
|
||||||
* generated DFA states that has been explored
|
* generated DFA states that have been explored. Note: the current implementation is NOT thread-safe
|
||||||
*
|
*
|
||||||
* <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
|
* <p>implemented based on: https://swtch.com/~rsc/regexp/regexp1.html
|
||||||
|
*
|
||||||
|
* @lucene.internal
|
||||||
*/
|
*/
|
||||||
public class NFARunAutomaton implements ByteRunnable, TransitionAccessor {
|
public class NFARunAutomaton implements ByteRunnable, TransitionAccessor {
|
||||||
|
|
||||||
|
|
|
@ -80,7 +80,10 @@ public class TestRegexpQuery extends LuceneTestCase {
|
||||||
newTerm(regex),
|
newTerm(regex),
|
||||||
RegExp.ALL,
|
RegExp.ALL,
|
||||||
RegExp.ASCII_CASE_INSENSITIVE,
|
RegExp.ASCII_CASE_INSENSITIVE,
|
||||||
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
|
RegexpQuery.DEFAULT_PROVIDER,
|
||||||
|
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT,
|
||||||
|
MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE,
|
||||||
|
random().nextBoolean());
|
||||||
return searcher.count(query);
|
return searcher.count(query);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -52,6 +52,7 @@ import org.apache.lucene.util.automaton.RegExp;
|
||||||
public class TestRegexpRandom2 extends LuceneTestCase {
|
public class TestRegexpRandom2 extends LuceneTestCase {
|
||||||
protected IndexSearcher searcher1;
|
protected IndexSearcher searcher1;
|
||||||
protected IndexSearcher searcher2;
|
protected IndexSearcher searcher2;
|
||||||
|
protected IndexSearcher searcher3;
|
||||||
private IndexReader reader;
|
private IndexReader reader;
|
||||||
private Directory dir;
|
private Directory dir;
|
||||||
protected String fieldName;
|
protected String fieldName;
|
||||||
|
@ -95,6 +96,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
|
||||||
reader = writer.getReader();
|
reader = writer.getReader();
|
||||||
searcher1 = newSearcher(reader);
|
searcher1 = newSearcher(reader);
|
||||||
searcher2 = newSearcher(reader);
|
searcher2 = newSearcher(reader);
|
||||||
|
searcher3 = newSearcher(reader);
|
||||||
writer.close();
|
writer.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -172,11 +174,22 @@ public class TestRegexpRandom2 extends LuceneTestCase {
|
||||||
/** check that the # of hits is the same as from a very simple regexpquery implementation. */
|
/** check that the # of hits is the same as from a very simple regexpquery implementation. */
|
||||||
protected void assertSame(String regexp) throws IOException {
|
protected void assertSame(String regexp) throws IOException {
|
||||||
RegexpQuery smart = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
|
RegexpQuery smart = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
|
||||||
|
RegexpQuery nfaQuery =
|
||||||
|
new RegexpQuery(
|
||||||
|
new Term(fieldName, regexp),
|
||||||
|
RegExp.NONE,
|
||||||
|
0,
|
||||||
|
RegexpQuery.DEFAULT_PROVIDER,
|
||||||
|
0,
|
||||||
|
MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE,
|
||||||
|
false);
|
||||||
DumbRegexpQuery dumb = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
|
DumbRegexpQuery dumb = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
|
||||||
|
|
||||||
TopDocs smartDocs = searcher1.search(smart, 25);
|
TopDocs smartDocs = searcher1.search(smart, 25);
|
||||||
TopDocs dumbDocs = searcher2.search(dumb, 25);
|
TopDocs dumbDocs = searcher2.search(dumb, 25);
|
||||||
|
TopDocs nfaDocs = searcher3.search(nfaQuery, 25);
|
||||||
|
|
||||||
CheckHits.checkEqual(smart, smartDocs.scoreDocs, dumbDocs.scoreDocs);
|
CheckHits.checkEqual(smart, smartDocs.scoreDocs, dumbDocs.scoreDocs);
|
||||||
|
CheckHits.checkEqual(nfaQuery, nfaDocs.scoreDocs, dumbDocs.scoreDocs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue