From 7a872c7a5c00d846314d44a445f8b0e83acb6a86 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 8 Dec 2021 21:44:26 -0500 Subject: [PATCH] LUCENE-10296: Stop minimizing regepx (#528) In current trunk, we let caller (e.g. RegExpQuery) try to "reduce" the expression. The parser nor the low-level executors don't implicitly call exponential-time algorithms anymore. But now that we have cleaned this up, we can see it is even worse than just calling determinize(). We still call minimize() which is much crazier and much more. We stopped doing this for all other AutomatonQuery subclasses a long time ago, as we determined that it didn't help performance. Additionally, minimization vs. determinization is even less important than early days where we found trouble: the representation got a lot better. Today when you finishState we do a lot of practical sorting/coalescing on-the-fly. Also we added this fancy UTF32-to-UTF8 automata convertor, that makes the worst-case-space-per-state significantly lower than it was before? So why minimize() ? Let's just replace minimize() calls with determinize() calls? I've already swapped them out for all of src/test, to get jenkins looking for issues ahead of time. This change moves hopcroft minimization (MinimizeOperations) to src/test for now. I'd like to explore nuking it from there as a next step, any tests that truly need minimization should be fine with brzozowski's algorithm. --- .../org/apache/lucene/analysis/hunspell/AffixCondition.java | 3 +-- .../lucene/analysis/pattern/SimplePatternSplitTokenizer.java | 5 +---- .../analysis/pattern/SimplePatternSplitTokenizerFactory.java | 3 +-- .../core/src/java/org/apache/lucene/search/RegexpQuery.java | 3 +-- .../apache/lucene/util/automaton/MinimizationOperations.java | 0 .../lucene/search/suggest/document/RegexCompletionQuery.java | 5 ++--- 6 files changed, 6 insertions(+), 13 deletions(-) rename lucene/core/src/{java => test}/org/apache/lucene/util/automaton/MinimizationOperations.java (100%) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java index 42c4ab0834a..9686f68e102 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java @@ -21,7 +21,6 @@ import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX; import java.util.regex.PatternSyntaxException; import org.apache.lucene.util.automaton.CharacterRunAutomaton; -import org.apache.lucene.util.automaton.MinimizationOperations; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; @@ -156,7 +155,7 @@ interface AffixCondition { boolean forSuffix = kind == AffixKind.SUFFIX; CharacterRunAutomaton automaton = new CharacterRunAutomaton( - MinimizationOperations.minimize( + Operations.determinize( new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)); return (word, offset, length) -> diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java index 8062dad8b6b..1914875560d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java @@ -24,7 +24,6 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeFactory; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton; -import org.apache.lucene.util.automaton.MinimizationOperations; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; @@ -75,9 +74,7 @@ public final class SimplePatternSplitTokenizer extends Tokenizer { /** See {@link RegExp} for the accepted syntax. */ public SimplePatternSplitTokenizer( AttributeFactory factory, String regexp, int determinizeWorkLimit) { - this( - factory, - MinimizationOperations.minimize(new RegExp(regexp).toAutomaton(), determinizeWorkLimit)); + this(factory, Operations.determinize(new RegExp(regexp).toAutomaton(), determinizeWorkLimit)); } /** Runs a pre-built automaton. */ diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java index 2e510e62027..7472bba5848 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java @@ -21,7 +21,6 @@ import java.util.Map; import org.apache.lucene.analysis.TokenizerFactory; import org.apache.lucene.util.AttributeFactory; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.MinimizationOperations; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; @@ -74,7 +73,7 @@ public class SimplePatternSplitTokenizerFactory extends TokenizerFactory { determinizeWorkLimit = getInt(args, "determinizeWorkLimit", Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); dfa = - MinimizationOperations.minimize( + Operations.determinize( new RegExp(require(args, PATTERN)).toAutomaton(), determinizeWorkLimit); if (args.isEmpty() == false) { throw new IllegalArgumentException("Unknown parameters: " + args); diff --git a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java index bc2dfc50200..e786ebc8430 100644 --- a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java @@ -19,7 +19,6 @@ package org.apache.lucene.search; import org.apache.lucene.index.Term; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.AutomatonProvider; -import org.apache.lucene.util.automaton.MinimizationOperations; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; @@ -140,7 +139,7 @@ public class RegexpQuery extends AutomatonQuery { int determinizeWorkLimit) { super( term, - MinimizationOperations.minimize( + Operations.determinize( new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(provider), determinizeWorkLimit)); } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java similarity index 100% rename from lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java rename to lucene/core/src/test/org/apache/lucene/util/automaton/MinimizationOperations.java diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/RegexCompletionQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/RegexCompletionQuery.java index 324312d0436..7ee27bb0d1d 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/RegexCompletionQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/RegexCompletionQuery.java @@ -25,7 +25,6 @@ import org.apache.lucene.search.Weight; import org.apache.lucene.search.suggest.BitsProducer; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.MinimizationOperations; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; @@ -75,7 +74,7 @@ public class RegexCompletionQuery extends CompletionQuery { * @param term query is run against {@link Term#field()} and {@link Term#text()} is interpreted as * a regular expression * @param flags used as syntax_flag in {@link RegExp#RegExp(String, int)} - * @param determinizeWorkLimit used in {@link MinimizationOperations#minimize(Automaton, int)} + * @param determinizeWorkLimit used in {@link Operations#determinize(Automaton, int)} * @param filter used to query on a sub set of documents */ public RegexCompletionQuery(Term term, int flags, int determinizeWorkLimit, BitsProducer filter) { @@ -92,7 +91,7 @@ public class RegexCompletionQuery extends CompletionQuery { Automaton automaton = getTerm().text().isEmpty() ? Automata.makeEmpty() - : MinimizationOperations.minimize( + : Operations.determinize( new RegExp(getTerm().text(), flags).toAutomaton(), determinizeWorkLimit); return new CompletionWeight(this, automaton); }