Add deprecated complement (~) operator to RegExp (#13739)

Previously all regexp parsing required determinization and minimization up-front, which can be costly (exponential time).

Lucene 10 removes the determinization and minimization from RegExp and allows the user to choose:

* determinize() the result and get the DFA query execution of previous releases.
* don't determinize() and possibly get a new NFA query that determinizes-as-it-goes.

Complement of arbitrary automata is incompatible with this choice, as it requires determinization for correctness. It was previously a non-default operator that could be enabled with a special flag: RegExp.COMPLEMENT, or would be included with RegExp.ALL, which turns on all special syntax flags. Lucene 10 removed the operator, as it can't be supported while still giving the user the NFA/DFA choice, and requires exponential time during parsing.

To ease transition: add RegExp.DEPRECATED_COMPLEMENT syntax flag and Kind.DEPRECATED_COMPLEMENT node:
* syntax flag can be enabled with RegExp(s, RegExp.DEPRECATED_COMPLEMENT);
* syntax flag is **NOT** included by RegExp.ALL: e.g. you must do RegExp(s, RegExp.ALL | RegExp.DEPRECATED_COMPLEMENT) to get ALL flags and also the deprecated complement (~) operator. This enforces a java deprecation reference in the calling code to enable the flag.
* deprecated complement (~) runs with an internal limit: Operations.DEFAULT_DETERMINIZE_WORK_LIMIT. It is not configurable. If it is exceeded, you get a TooComplexToDeterminizeException.
* there is intentionally only a single dead-simple test so that this hack doesn't cause us pain with CI/builds. We don't want random automata testing to only rarely encounter an exponential algorithm!

After Lucene 10 is branched, this deprecated support can be removed by reverting this commit.
This commit is contained in:
Robert Muir 2024-09-09 08:05:00 -04:00 committed by GitHub
parent dc47adbbe7
commit 22bbc603b4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 61 additions and 5 deletions

View File

@ -368,7 +368,7 @@ public class RegExp {
REGEXP_REPEAT_MIN,
/** An expression that repeats a minimum and maximum number of times */
REGEXP_REPEAT_MINMAX,
/** The complement of an expression */
/** The complement of a character class */
REGEXP_COMPLEMENT,
/** A Character */
REGEXP_CHAR,
@ -387,7 +387,14 @@ public class RegExp {
/** An Interval expression */
REGEXP_INTERVAL,
/** An expression for a pre-defined class e.g. \w */
REGEXP_PRE_CLASS
REGEXP_PRE_CLASS,
/**
* The complement of an expression.
*
* @deprecated Will be removed in Lucene 11
*/
@Deprecated
REGEXP_DEPRECATED_COMPLEMENT
}
// ----- Syntax flags ( <= 0xff ) ------
@ -412,11 +419,23 @@ public class RegExp {
/** Syntax flag, enables no optional regexp syntax. */
public static final int NONE = 0x0000;
// ----- Matching flags ( > 0xff ) ------
// ----- Matching flags ( > 0xff <= 0xffff ) ------
/** Allows case insensitive matching of ASCII characters. */
public static final int ASCII_CASE_INSENSITIVE = 0x0100;
// ----- Deprecated flags ( > 0xffff ) ------
/**
* Allows regexp parsing of the complement (<code>~</code>).
*
* <p>Note that processing the complement can require exponential time, but will be bounded by an
* internal limit. Regexes exceeding the limit will fail with TooComplexToDeterminizeException.
*
* @deprecated This flag will be removed in Lucene 11
*/
@Deprecated public static final int DEPRECATED_COMPLEMENT = 0x10000;
// Immutable parsed state
/** The type of expression */
public final Kind kind;
@ -471,7 +490,7 @@ public class RegExp {
* @exception IllegalArgumentException if an error occurred while parsing the regular expression
*/
public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumentException {
if (syntax_flags > ALL) {
if ((syntax_flags & ~DEPRECATED_COMPLEMENT) > ALL) {
throw new IllegalArgumentException("Illegal syntax flag");
}
@ -621,6 +640,12 @@ public class RegExp {
a = exp1.toAutomaton(automata, automaton_provider);
a = Operations.complement(a, Integer.MAX_VALUE);
break;
case REGEXP_DEPRECATED_COMPLEMENT:
// to ease transitions for users only, support arbitrary complement
// but bounded by DEFAULT_DETERMINIZE_WORK_LIMIT: must not be configurable.
a = exp1.toAutomaton(automata, automaton_provider);
a = Operations.complement(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
break;
case REGEXP_CHAR:
if (check(ASCII_CASE_INSENSITIVE)) {
a = toCaseInsensitiveChar(c);
@ -768,6 +793,7 @@ public class RegExp {
b.append("){").append(min).append(",").append(max).append("}");
break;
case REGEXP_COMPLEMENT:
case REGEXP_DEPRECATED_COMPLEMENT:
b.append("~(");
exp1.toStringBuilder(b);
b.append(")");
@ -831,6 +857,7 @@ public class RegExp {
case REGEXP_OPTIONAL:
case REGEXP_REPEAT:
case REGEXP_COMPLEMENT:
case REGEXP_DEPRECATED_COMPLEMENT:
b.append(indent);
b.append(kind);
b.append('\n');
@ -935,6 +962,7 @@ public class RegExp {
case REGEXP_REPEAT_MIN:
case REGEXP_REPEAT_MINMAX:
case REGEXP_COMPLEMENT:
case REGEXP_DEPRECATED_COMPLEMENT:
exp1.getIdentifiers(set);
break;
case REGEXP_AUTOMATON:
@ -1011,6 +1039,16 @@ public class RegExp {
return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null);
}
/**
* Creates node that will compute complement of arbitrary expression.
*
* @param flags flags forwarded unchanged to the new node
* @param exp the expression to complement
* @return a container node of kind {@link Kind#REGEXP_DEPRECATED_COMPLEMENT} built from {@code
*     exp}
* @deprecated Will be removed in Lucene 11
*/
@Deprecated
static RegExp makeDeprecatedComplement(int flags, RegExp exp) {
return newContainerNode(flags, Kind.REGEXP_DEPRECATED_COMPLEMENT, exp, null);
}
static RegExp makeChar(int flags, int c) {
return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
}
@ -1140,7 +1178,9 @@ public class RegExp {
}
final RegExp parseComplExp() throws IllegalArgumentException {
return parseCharClassExp();
if (check(DEPRECATED_COMPLEMENT) && match('~'))
return makeDeprecatedComplement(flags, parseComplExp());
else return parseCharClassExp();
}
final RegExp parseCharClassExp() throws IllegalArgumentException {

View File

@ -19,6 +19,7 @@ package org.apache.lucene.util.automaton;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.BytesRef;
public class TestRegExp extends LuceneTestCase {
@ -259,4 +260,19 @@ public class TestRegExp extends LuceneTestCase {
public void testRegExpNoStackOverflow() {
new RegExp("(a)|".repeat(50000) + "(a)");
}
/**
* Tests the deprecated complement flag: {@code ~(abcd)} parsed with {@link
* RegExp#DEPRECATED_COMPLEMENT} must accept the same language as the complement of the string
* automaton for "abcd".
*
* <p>Keep this single simple test only; deliberately no random tests, so that the deprecated
* operator does not cause us pain.
*
* @deprecated Remove in Lucene 11
*/
@Deprecated
public void testDeprecatedComplement() {
Automaton expected =
Operations.complement(
Automata.makeString("abcd"), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
Automaton actual = new RegExp("~(abcd)", RegExp.DEPRECATED_COMPLEMENT).toAutomaton();
assertTrue(AutomatonTestUtil.sameLanguage(expected, actual));
}
}