LUCENE-10296: Stop minimizing regexp (#528)

In current trunk, we let the caller (e.g. RegExpQuery) try to "reduce" the expression. Neither the parser nor the low-level executors implicitly call exponential-time algorithms anymore.

But now that we have cleaned this up, we can see it is even worse than just calling determinize(). We still call minimize(), which is much crazier and much more expensive.

We stopped doing this for all other AutomatonQuery subclasses a long time ago, as we determined that it didn't help performance. Additionally, minimization vs. determinization is even less important than in the early days, when we ran into trouble: the representation got a lot better. Today when you finishState we do a lot of practical sorting/coalescing on-the-fly. We also added a fancy UTF32-to-UTF8 automaton converter, which makes the worst-case space per state significantly lower than it was before. So why minimize()?

Let's just replace minimize() calls with determinize() calls. I've already swapped them out for all of src/test, to get Jenkins looking for issues ahead of time.

This change moves Hopcroft minimization (MinimizationOperations) to src/test for now. I'd like to explore nuking it from there as a next step; any tests that truly need minimization should be fine with Brzozowski's algorithm.
This commit is contained in:
Robert Muir 2021-12-08 21:44:26 -05:00 committed by GitHub
parent 5d39bca87a
commit 7a872c7a5c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 6 additions and 13 deletions

View File

@ -21,7 +21,6 @@ import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
import java.util.regex.PatternSyntaxException;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
@ -156,7 +155,7 @@ interface AffixCondition {
boolean forSuffix = kind == AffixKind.SUFFIX;
CharacterRunAutomaton automaton =
new CharacterRunAutomaton(
MinimizationOperations.minimize(
Operations.determinize(
new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton(),
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT));
return (word, offset, length) ->

View File

@ -24,7 +24,6 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
@ -75,9 +74,7 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
/** See {@link RegExp} for the accepted syntax. */
public SimplePatternSplitTokenizer(
AttributeFactory factory, String regexp, int determinizeWorkLimit) {
this(
factory,
MinimizationOperations.minimize(new RegExp(regexp).toAutomaton(), determinizeWorkLimit));
this(factory, Operations.determinize(new RegExp(regexp).toAutomaton(), determinizeWorkLimit));
}
/** Runs a pre-built automaton. */

View File

@ -21,7 +21,6 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
@ -74,7 +73,7 @@ public class SimplePatternSplitTokenizerFactory extends TokenizerFactory {
determinizeWorkLimit =
getInt(args, "determinizeWorkLimit", Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
dfa =
MinimizationOperations.minimize(
Operations.determinize(
new RegExp(require(args, PATTERN)).toAutomaton(), determinizeWorkLimit);
if (args.isEmpty() == false) {
throw new IllegalArgumentException("Unknown parameters: " + args);

View File

@ -19,7 +19,6 @@ package org.apache.lucene.search;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonProvider;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
@ -140,7 +139,7 @@ public class RegexpQuery extends AutomatonQuery {
int determinizeWorkLimit) {
super(
term,
MinimizationOperations.minimize(
Operations.determinize(
new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(provider),
determinizeWorkLimit));
}

View File

@ -25,7 +25,6 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.BitsProducer;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
@ -75,7 +74,7 @@ public class RegexCompletionQuery extends CompletionQuery {
* @param term query is run against {@link Term#field()} and {@link Term#text()} is interpreted as
* a regular expression
* @param flags used as syntax_flag in {@link RegExp#RegExp(String, int)}
* @param determinizeWorkLimit used in {@link MinimizationOperations#minimize(Automaton, int)}
* @param determinizeWorkLimit used in {@link Operations#determinize(Automaton, int)}
* @param filter used to query on a sub set of documents
*/
public RegexCompletionQuery(Term term, int flags, int determinizeWorkLimit, BitsProducer filter) {
@ -92,7 +91,7 @@ public class RegexCompletionQuery extends CompletionQuery {
Automaton automaton =
getTerm().text().isEmpty()
? Automata.makeEmpty()
: MinimizationOperations.minimize(
: Operations.determinize(
new RegExp(getTerm().text(), flags).toAutomaton(), determinizeWorkLimit);
return new CompletionWeight(this, automaton);
}