mirror of https://github.com/apache/lucene.git
LUCENE-9386 add case insensitive RegExp matching option (#1541)
Added case insensitive search option (currently only works with ASCII characters)
This commit is contained in:
parent
00203c292f
commit
887fe4c83d
|
@ -83,7 +83,7 @@ public class RegexpQuery extends AutomatonQuery {
|
||||||
* Constructs a query for terms matching <code>term</code>.
|
* Constructs a query for terms matching <code>term</code>.
|
||||||
*
|
*
|
||||||
* @param term regular expression.
|
* @param term regular expression.
|
||||||
* @param flags optional RegExp features from {@link RegExp}
|
* @param flags optional RegExp syntax features from {@link RegExp}
|
||||||
* @param maxDeterminizedStates maximum number of states that compiling the
|
* @param maxDeterminizedStates maximum number of states that compiling the
|
||||||
* automaton for the regexp can result in. Set higher to allow more complex
|
* automaton for the regexp can result in. Set higher to allow more complex
|
||||||
* queries and lower to prevent memory exhaustion.
|
* queries and lower to prevent memory exhaustion.
|
||||||
|
@ -96,16 +96,46 @@ public class RegexpQuery extends AutomatonQuery {
|
||||||
* Constructs a query for terms matching <code>term</code>.
|
* Constructs a query for terms matching <code>term</code>.
|
||||||
*
|
*
|
||||||
* @param term regular expression.
|
* @param term regular expression.
|
||||||
* @param flags optional RegExp features from {@link RegExp}
|
* @param syntax_flags optional RegExp syntax features from {@link RegExp}
|
||||||
|
* @param match_flags boolean 'or' of match behavior options such as case insensitivity
|
||||||
|
* @param maxDeterminizedStates maximum number of states that compiling the
|
||||||
|
* automaton for the regexp can result in. Set higher to allow more complex
|
||||||
|
* queries and lower to prevent memory exhaustion.
|
||||||
|
*/
|
||||||
|
public RegexpQuery(Term term, int syntax_flags, int match_flags, int maxDeterminizedStates) {
|
||||||
|
this(term, syntax_flags, match_flags, defaultProvider, maxDeterminizedStates);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a query for terms matching <code>term</code>.
|
||||||
|
*
|
||||||
|
* @param term regular expression.
|
||||||
|
* @param syntax_flags optional RegExp syntax features from {@link RegExp}
|
||||||
* @param provider custom AutomatonProvider for named automata
|
* @param provider custom AutomatonProvider for named automata
|
||||||
* @param maxDeterminizedStates maximum number of states that compiling the
|
* @param maxDeterminizedStates maximum number of states that compiling the
|
||||||
* automaton for the regexp can result in. Set higher to allow more complex
|
* automaton for the regexp can result in. Set higher to allow more complex
|
||||||
* queries and lower to prevent memory exhaustion.
|
* queries and lower to prevent memory exhaustion.
|
||||||
*/
|
*/
|
||||||
public RegexpQuery(Term term, int flags, AutomatonProvider provider,
|
public RegexpQuery(Term term, int syntax_flags, AutomatonProvider provider,
|
||||||
|
int maxDeterminizedStates) {
|
||||||
|
this(term, syntax_flags, 0, provider, maxDeterminizedStates);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a query for terms matching <code>term</code>.
|
||||||
|
*
|
||||||
|
* @param term regular expression.
|
||||||
|
* @param syntax_flags optional RegExp syntax features from {@link RegExp}
|
||||||
|
* @param match_flags boolean 'or' of match behavior options such as case insensitivity
|
||||||
|
* @param provider custom AutomatonProvider for named automata
|
||||||
|
* @param maxDeterminizedStates maximum number of states that compiling the
|
||||||
|
* automaton for the regexp can result in. Set higher to allow more complex
|
||||||
|
* queries and lower to prevent memory exhaustion.
|
||||||
|
*/
|
||||||
|
public RegexpQuery(Term term, int syntax_flags, int match_flags, AutomatonProvider provider,
|
||||||
int maxDeterminizedStates) {
|
int maxDeterminizedStates) {
|
||||||
super(term,
|
super(term,
|
||||||
new RegExp(term.text(), flags).toAutomaton(
|
new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(
|
||||||
provider, maxDeterminizedStates), maxDeterminizedStates);
|
provider, maxDeterminizedStates), maxDeterminizedStates);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,7 @@ package org.apache.lucene.util.automaton;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -405,6 +406,7 @@ public class RegExp {
|
||||||
REGEXP_PRE_CLASS
|
REGEXP_PRE_CLASS
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//----- Syntax flags ( <= 0xff ) ------
|
||||||
/**
|
/**
|
||||||
* Syntax flag, enables intersection (<code>&</code>).
|
* Syntax flag, enables intersection (<code>&</code>).
|
||||||
*/
|
*/
|
||||||
|
@ -439,13 +441,20 @@ public class RegExp {
|
||||||
/**
|
/**
|
||||||
* Syntax flag, enables all optional regexp syntax.
|
* Syntax flag, enables all optional regexp syntax.
|
||||||
*/
|
*/
|
||||||
public static final int ALL = 0xffff;
|
public static final int ALL = 0xff;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Syntax flag, enables no optional regexp syntax.
|
* Syntax flag, enables no optional regexp syntax.
|
||||||
*/
|
*/
|
||||||
public static final int NONE = 0x0000;
|
public static final int NONE = 0x0000;
|
||||||
|
|
||||||
|
//----- Matching flags ( > 0xff ) ------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allows case insensitive matching of ASCII characters.
|
||||||
|
*/
|
||||||
|
public static final int ASCII_CASE_INSENSITIVE = 0x0100;
|
||||||
|
|
||||||
//Immutable parsed state
|
//Immutable parsed state
|
||||||
/**
|
/**
|
||||||
* The type of expression
|
* The type of expression
|
||||||
|
@ -474,7 +483,7 @@ public class RegExp {
|
||||||
|
|
||||||
// Parser variables
|
// Parser variables
|
||||||
private final String originalString;
|
private final String originalString;
|
||||||
int flags;
|
final int flags;
|
||||||
int pos;
|
int pos;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -499,10 +508,30 @@ public class RegExp {
|
||||||
* regular expression
|
* regular expression
|
||||||
*/
|
*/
|
||||||
public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
|
public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
|
||||||
|
this(s, syntax_flags, 0);
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Constructs new <code>RegExp</code> from a string.
|
||||||
|
*
|
||||||
|
* @param s regexp string
|
||||||
|
* @param syntax_flags boolean 'or' of optional syntax constructs to be
|
||||||
|
* enabled
|
||||||
|
* @param match_flags boolean 'or' of match behavior options such as case insensitivity
|
||||||
|
* @exception IllegalArgumentException if an error occurred while parsing the
|
||||||
|
* regular expression
|
||||||
|
*/
|
||||||
|
public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumentException {
|
||||||
|
if (syntax_flags > ALL) {
|
||||||
|
throw new IllegalArgumentException("Illegal syntax flag");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (match_flags > 0 && match_flags <= ALL) {
|
||||||
|
throw new IllegalArgumentException("Illegal match flag");
|
||||||
|
}
|
||||||
|
flags = syntax_flags | match_flags;
|
||||||
originalString = s;
|
originalString = s;
|
||||||
flags = syntax_flags;
|
|
||||||
RegExp e;
|
RegExp e;
|
||||||
if (s.length() == 0) e = makeString("");
|
if (s.length() == 0) e = makeString(flags, "");
|
||||||
else {
|
else {
|
||||||
e = parseUnionExp();
|
e = parseUnionExp();
|
||||||
if (pos < originalString.length()) throw new IllegalArgumentException(
|
if (pos < originalString.length()) throw new IllegalArgumentException(
|
||||||
|
@ -520,10 +549,10 @@ public class RegExp {
|
||||||
to = e.to;
|
to = e.to;
|
||||||
}
|
}
|
||||||
|
|
||||||
RegExp(Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){
|
RegExp(int flags, Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){
|
||||||
this.originalString = null;
|
this.originalString = null;
|
||||||
this.kind = kind;
|
this.kind = kind;
|
||||||
this.flags = 0;
|
this.flags = flags;
|
||||||
this.exp1 = exp1;
|
this.exp1 = exp1;
|
||||||
this.exp2 = exp2;
|
this.exp2 = exp2;
|
||||||
this.s = s;
|
this.s = s;
|
||||||
|
@ -536,19 +565,19 @@ public class RegExp {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Simplified construction of container nodes
|
// Simplified construction of container nodes
|
||||||
static RegExp newContainerNode(Kind kind, RegExp exp1, RegExp exp2) {
|
static RegExp newContainerNode(int flags, Kind kind, RegExp exp1, RegExp exp2) {
|
||||||
return new RegExp(kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
|
return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Simplified construction of repeating nodes
|
// Simplified construction of repeating nodes
|
||||||
static RegExp newRepeatingNode(Kind kind, RegExp exp, int min, int max) {
|
static RegExp newRepeatingNode(int flags, Kind kind, RegExp exp, int min, int max) {
|
||||||
return new RegExp(kind, exp, null, null, 0, min, max, 0, 0, 0);
|
return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Simplified construction of leaf nodes
|
// Simplified construction of leaf nodes
|
||||||
static RegExp newLeafNode(Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
|
static RegExp newLeafNode(int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
|
||||||
return new RegExp(kind, null, null, s, c, min, max, digits, from, to);
|
return new RegExp(flags, kind, null, null, s, c, min, max, digits, from, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -703,7 +732,11 @@ public class RegExp {
|
||||||
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||||
break;
|
break;
|
||||||
case REGEXP_CHAR:
|
case REGEXP_CHAR:
|
||||||
|
if (check(ASCII_CASE_INSENSITIVE)) {
|
||||||
|
a = toCaseInsensitiveChar(c, maxDeterminizedStates);
|
||||||
|
} else {
|
||||||
a = Automata.makeChar(c);
|
a = Automata.makeChar(c);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case REGEXP_CHAR_RANGE:
|
case REGEXP_CHAR_RANGE:
|
||||||
a = Automata.makeCharRange(from, to);
|
a = Automata.makeCharRange(from, to);
|
||||||
|
@ -715,7 +748,11 @@ public class RegExp {
|
||||||
a = Automata.makeEmpty();
|
a = Automata.makeEmpty();
|
||||||
break;
|
break;
|
||||||
case REGEXP_STRING:
|
case REGEXP_STRING:
|
||||||
|
if (check(ASCII_CASE_INSENSITIVE)) {
|
||||||
|
a = toCaseInsensitiveString(maxDeterminizedStates);
|
||||||
|
} else {
|
||||||
a = Automata.makeString(s);
|
a = Automata.makeString(s);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case REGEXP_ANYSTRING:
|
case REGEXP_ANYSTRING:
|
||||||
a = Automata.makeAnyString();
|
a = Automata.makeAnyString();
|
||||||
|
@ -743,6 +780,34 @@ public class RegExp {
|
||||||
}
|
}
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
private Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
|
||||||
|
Automaton case1 = Automata.makeChar(codepoint);
|
||||||
|
// For now we only work with ASCII characters
|
||||||
|
if (codepoint > 128) {
|
||||||
|
return case1;
|
||||||
|
}
|
||||||
|
int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
|
||||||
|
Automaton result;
|
||||||
|
if (altCase != codepoint) {
|
||||||
|
result = Operations.union(case1, Automata.makeChar(altCase));
|
||||||
|
result = MinimizationOperations.minimize(result, maxDeterminizedStates);
|
||||||
|
} else {
|
||||||
|
result = case1;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Automaton toCaseInsensitiveString(int maxDeterminizedStates) {
|
||||||
|
List<Automaton> list = new ArrayList<>();
|
||||||
|
|
||||||
|
Iterator<Integer> iter = s.codePoints().iterator();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
|
||||||
|
}
|
||||||
|
Automaton a = Operations.concatenate(list);
|
||||||
|
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
|
private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
|
||||||
Map<String,Automaton> automata, AutomatonProvider automaton_provider,
|
Map<String,Automaton> automata, AutomatonProvider automaton_provider,
|
||||||
|
@ -1000,97 +1065,97 @@ public class RegExp {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeUnion(RegExp exp1, RegExp exp2) {
|
static RegExp makeUnion(int flags, RegExp exp1, RegExp exp2) {
|
||||||
return newContainerNode(Kind.REGEXP_UNION, exp1, exp2);
|
return newContainerNode(flags, Kind.REGEXP_UNION, exp1, exp2);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
|
static RegExp makeConcatenation(int flags, RegExp exp1, RegExp exp2) {
|
||||||
if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
|
if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
|
||||||
&& (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
|
&& (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
|
||||||
exp1, exp2);
|
flags, exp1, exp2);
|
||||||
RegExp rexp1, rexp2;
|
RegExp rexp1, rexp2;
|
||||||
if (exp1.kind == Kind.REGEXP_CONCATENATION
|
if (exp1.kind == Kind.REGEXP_CONCATENATION
|
||||||
&& (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
|
&& (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
|
||||||
&& (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
|
&& (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
|
||||||
rexp1 = exp1.exp1;
|
rexp1 = exp1.exp1;
|
||||||
rexp2 = makeString(exp1.exp2, exp2);
|
rexp2 = makeString(flags, exp1.exp2, exp2);
|
||||||
} else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
|
} else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
|
||||||
&& exp2.kind == Kind.REGEXP_CONCATENATION
|
&& exp2.kind == Kind.REGEXP_CONCATENATION
|
||||||
&& (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
|
&& (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
|
||||||
rexp1 = makeString(exp1, exp2.exp1);
|
rexp1 = makeString(flags, exp1, exp2.exp1);
|
||||||
rexp2 = exp2.exp2;
|
rexp2 = exp2.exp2;
|
||||||
} else {
|
} else {
|
||||||
rexp1 = exp1;
|
rexp1 = exp1;
|
||||||
rexp2 = exp2;
|
rexp2 = exp2;
|
||||||
}
|
}
|
||||||
return newContainerNode(Kind.REGEXP_CONCATENATION, rexp1, rexp2);
|
return newContainerNode(flags, Kind.REGEXP_CONCATENATION, rexp1, rexp2);
|
||||||
}
|
}
|
||||||
|
|
||||||
static private RegExp makeString(RegExp exp1, RegExp exp2) {
|
static private RegExp makeString(int flags, RegExp exp1, RegExp exp2) {
|
||||||
StringBuilder b = new StringBuilder();
|
StringBuilder b = new StringBuilder();
|
||||||
if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
|
if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
|
||||||
else b.appendCodePoint(exp1.c);
|
else b.appendCodePoint(exp1.c);
|
||||||
if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
|
if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
|
||||||
else b.appendCodePoint(exp2.c);
|
else b.appendCodePoint(exp2.c);
|
||||||
return makeString(b.toString());
|
return makeString(flags, b.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
|
static RegExp makeIntersection(int flags, RegExp exp1, RegExp exp2) {
|
||||||
return newContainerNode(Kind.REGEXP_INTERSECTION, exp1, exp2);
|
return newContainerNode(flags, Kind.REGEXP_INTERSECTION, exp1, exp2);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeOptional(RegExp exp) {
|
static RegExp makeOptional(int flags, RegExp exp) {
|
||||||
return newContainerNode(Kind.REGEXP_OPTIONAL, exp, null);
|
return newContainerNode(flags, Kind.REGEXP_OPTIONAL, exp, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeRepeat(RegExp exp) {
|
static RegExp makeRepeat(int flags, RegExp exp) {
|
||||||
return newContainerNode(Kind.REGEXP_REPEAT, exp, null);
|
return newContainerNode(flags, Kind.REGEXP_REPEAT, exp, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeRepeat(RegExp exp, int min) {
|
static RegExp makeRepeat(int flags, RegExp exp, int min) {
|
||||||
return newRepeatingNode(Kind.REGEXP_REPEAT_MIN, exp, min, 0);
|
return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MIN, exp, min, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeRepeat(RegExp exp, int min, int max) {
|
static RegExp makeRepeat(int flags, RegExp exp, int min, int max) {
|
||||||
return newRepeatingNode(Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
|
return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeComplement(RegExp exp) {
|
static RegExp makeComplement(int flags, RegExp exp) {
|
||||||
return newContainerNode(Kind.REGEXP_COMPLEMENT, exp, null);
|
return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeChar(int c) {
|
static RegExp makeChar(int flags, int c) {
|
||||||
return newLeafNode(Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
|
return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeCharRange(int from, int to) {
|
static RegExp makeCharRange(int flags, int from, int to) {
|
||||||
if (from > to)
|
if (from > to)
|
||||||
throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
|
throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
|
||||||
return newLeafNode(Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
|
return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeAnyChar() {
|
static RegExp makeAnyChar(int flags) {
|
||||||
return newContainerNode(Kind.REGEXP_ANYCHAR, null, null);
|
return newContainerNode(flags, Kind.REGEXP_ANYCHAR, null, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeEmpty() {
|
static RegExp makeEmpty(int flags) {
|
||||||
return newContainerNode(Kind.REGEXP_EMPTY, null, null);
|
return newContainerNode(flags, Kind.REGEXP_EMPTY, null, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeString(String s) {
|
static RegExp makeString(int flags, String s) {
|
||||||
return newLeafNode(Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
|
return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeAnyString() {
|
static RegExp makeAnyString(int flags) {
|
||||||
return newContainerNode(Kind.REGEXP_ANYSTRING, null, null);
|
return newContainerNode(flags, Kind.REGEXP_ANYSTRING, null, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeAutomaton(String s) {
|
static RegExp makeAutomaton(int flags, String s) {
|
||||||
return newLeafNode(Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
|
return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RegExp makeInterval(int min, int max, int digits) {
|
static RegExp makeInterval(int flags, int min, int max, int digits) {
|
||||||
return newLeafNode(Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
|
return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean peek(String s) {
|
private boolean peek(String s) {
|
||||||
|
@ -1123,13 +1188,13 @@ public class RegExp {
|
||||||
|
|
||||||
final RegExp parseUnionExp() throws IllegalArgumentException {
|
final RegExp parseUnionExp() throws IllegalArgumentException {
|
||||||
RegExp e = parseInterExp();
|
RegExp e = parseInterExp();
|
||||||
if (match('|')) e = makeUnion(e, parseUnionExp());
|
if (match('|')) e = makeUnion(flags, e, parseUnionExp());
|
||||||
return e;
|
return e;
|
||||||
}
|
}
|
||||||
|
|
||||||
final RegExp parseInterExp() throws IllegalArgumentException {
|
final RegExp parseInterExp() throws IllegalArgumentException {
|
||||||
RegExp e = parseConcatExp();
|
RegExp e = parseConcatExp();
|
||||||
if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
|
if (check(INTERSECTION) && match('&')) e = makeIntersection(flags, e,
|
||||||
parseInterExp());
|
parseInterExp());
|
||||||
return e;
|
return e;
|
||||||
}
|
}
|
||||||
|
@ -1137,16 +1202,16 @@ public class RegExp {
|
||||||
final RegExp parseConcatExp() throws IllegalArgumentException {
|
final RegExp parseConcatExp() throws IllegalArgumentException {
|
||||||
RegExp e = parseRepeatExp();
|
RegExp e = parseRepeatExp();
|
||||||
if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
|
if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
|
||||||
e, parseConcatExp());
|
flags, e, parseConcatExp());
|
||||||
return e;
|
return e;
|
||||||
}
|
}
|
||||||
|
|
||||||
final RegExp parseRepeatExp() throws IllegalArgumentException {
|
final RegExp parseRepeatExp() throws IllegalArgumentException {
|
||||||
RegExp e = parseComplExp();
|
RegExp e = parseComplExp();
|
||||||
while (peek("?*+{")) {
|
while (peek("?*+{")) {
|
||||||
if (match('?')) e = makeOptional(e);
|
if (match('?')) e = makeOptional(flags, e);
|
||||||
else if (match('*')) e = makeRepeat(e);
|
else if (match('*')) e = makeRepeat(flags, e);
|
||||||
else if (match('+')) e = makeRepeat(e, 1);
|
else if (match('+')) e = makeRepeat(flags, e, 1);
|
||||||
else if (match('{')) {
|
else if (match('{')) {
|
||||||
int start = pos;
|
int start = pos;
|
||||||
while (peek("0123456789"))
|
while (peek("0123456789"))
|
||||||
|
@ -1164,15 +1229,15 @@ public class RegExp {
|
||||||
} else m = n;
|
} else m = n;
|
||||||
if (!match('}')) throw new IllegalArgumentException(
|
if (!match('}')) throw new IllegalArgumentException(
|
||||||
"expected '}' at position " + pos);
|
"expected '}' at position " + pos);
|
||||||
if (m == -1) e = makeRepeat(e, n);
|
if (m == -1) e = makeRepeat(flags, e, n);
|
||||||
else e = makeRepeat(e, n, m);
|
else e = makeRepeat(flags, e, n, m);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return e;
|
return e;
|
||||||
}
|
}
|
||||||
|
|
||||||
final RegExp parseComplExp() throws IllegalArgumentException {
|
final RegExp parseComplExp() throws IllegalArgumentException {
|
||||||
if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
|
if (check(COMPLEMENT) && match('~')) return makeComplement(flags, parseComplExp());
|
||||||
else return parseCharClassExp();
|
else return parseCharClassExp();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1181,7 +1246,7 @@ public class RegExp {
|
||||||
boolean negate = false;
|
boolean negate = false;
|
||||||
if (match('^')) negate = true;
|
if (match('^')) negate = true;
|
||||||
RegExp e = parseCharClasses();
|
RegExp e = parseCharClasses();
|
||||||
if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e));
|
if (negate) e = makeIntersection(flags, makeAnyChar(flags), makeComplement(flags, e));
|
||||||
if (!match(']')) throw new IllegalArgumentException(
|
if (!match(']')) throw new IllegalArgumentException(
|
||||||
"expected ']' at position " + pos);
|
"expected ']' at position " + pos);
|
||||||
return e;
|
return e;
|
||||||
|
@ -1191,7 +1256,7 @@ public class RegExp {
|
||||||
final RegExp parseCharClasses() throws IllegalArgumentException {
|
final RegExp parseCharClasses() throws IllegalArgumentException {
|
||||||
RegExp e = parseCharClass();
|
RegExp e = parseCharClass();
|
||||||
while (more() && !peek("]"))
|
while (more() && !peek("]"))
|
||||||
e = makeUnion(e, parseCharClass());
|
e = makeUnion(flags, e, parseCharClass());
|
||||||
return e;
|
return e;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1202,8 +1267,8 @@ public class RegExp {
|
||||||
}
|
}
|
||||||
|
|
||||||
int c = parseCharExp();
|
int c = parseCharExp();
|
||||||
if (match('-')) return makeCharRange(c, parseCharExp());
|
if (match('-')) return makeCharRange(flags, c, parseCharExp());
|
||||||
else return makeChar(c);
|
else return makeChar(flags, c);
|
||||||
}
|
}
|
||||||
|
|
||||||
RegExp expandPredefined() {
|
RegExp expandPredefined() {
|
||||||
|
@ -1232,11 +1297,11 @@ public class RegExp {
|
||||||
//See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
|
//See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
|
||||||
if (match('\\')) {
|
if (match('\\')) {
|
||||||
if (peek("dDwWsS")) {
|
if (peek("dDwWsS")) {
|
||||||
return newLeafNode(Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
|
return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (peek("\\")) {
|
if (peek("\\")) {
|
||||||
return makeChar(next());
|
return makeChar(flags, next());
|
||||||
}
|
}
|
||||||
|
|
||||||
// From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
|
// From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
|
||||||
|
@ -1252,18 +1317,18 @@ public class RegExp {
|
||||||
|
|
||||||
|
|
||||||
final RegExp parseSimpleExp() throws IllegalArgumentException {
|
final RegExp parseSimpleExp() throws IllegalArgumentException {
|
||||||
if (match('.')) return makeAnyChar();
|
if (match('.')) return makeAnyChar(flags);
|
||||||
else if (check(EMPTY) && match('#')) return makeEmpty();
|
else if (check(EMPTY) && match('#')) return makeEmpty(flags);
|
||||||
else if (check(ANYSTRING) && match('@')) return makeAnyString();
|
else if (check(ANYSTRING) && match('@')) return makeAnyString(flags);
|
||||||
else if (match('"')) {
|
else if (match('"')) {
|
||||||
int start = pos;
|
int start = pos;
|
||||||
while (more() && !peek("\""))
|
while (more() && !peek("\""))
|
||||||
next();
|
next();
|
||||||
if (!match('"')) throw new IllegalArgumentException(
|
if (!match('"')) throw new IllegalArgumentException(
|
||||||
"expected '\"' at position " + pos);
|
"expected '\"' at position " + pos);
|
||||||
return makeString(originalString.substring(start, pos - 1));
|
return makeString(flags, originalString.substring(start, pos - 1));
|
||||||
} else if (match('(')) {
|
} else if (match('(')) {
|
||||||
if (match(')')) return makeString("");
|
if (match(')')) return makeString(flags, "");
|
||||||
RegExp e = parseUnionExp();
|
RegExp e = parseUnionExp();
|
||||||
if (!match(')')) throw new IllegalArgumentException(
|
if (!match(')')) throw new IllegalArgumentException(
|
||||||
"expected ')' at position " + pos);
|
"expected ')' at position " + pos);
|
||||||
|
@ -1279,7 +1344,7 @@ public class RegExp {
|
||||||
if (i == -1) {
|
if (i == -1) {
|
||||||
if (!check(AUTOMATON)) throw new IllegalArgumentException(
|
if (!check(AUTOMATON)) throw new IllegalArgumentException(
|
||||||
"interval syntax error at position " + (pos - 1));
|
"interval syntax error at position " + (pos - 1));
|
||||||
return makeAutomaton(s);
|
return makeAutomaton(flags, s);
|
||||||
} else {
|
} else {
|
||||||
if (!check(INTERVAL)) throw new IllegalArgumentException(
|
if (!check(INTERVAL)) throw new IllegalArgumentException(
|
||||||
"illegal identifier at position " + (pos - 1));
|
"illegal identifier at position " + (pos - 1));
|
||||||
|
@ -1297,7 +1362,7 @@ public class RegExp {
|
||||||
imin = imax;
|
imin = imax;
|
||||||
imax = t;
|
imax = t;
|
||||||
}
|
}
|
||||||
return makeInterval(imin, imax, digits);
|
return makeInterval(flags, imin, imax, digits);
|
||||||
} catch (NumberFormatException e) {
|
} catch (NumberFormatException e) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
"interval syntax error at position " + (pos - 1));
|
"interval syntax error at position " + (pos - 1));
|
||||||
|
@ -1308,7 +1373,7 @@ public class RegExp {
|
||||||
if (predefined != null) {
|
if (predefined != null) {
|
||||||
return predefined;
|
return predefined;
|
||||||
}
|
}
|
||||||
return makeChar(parseCharExp());
|
return makeChar(flags, parseCharExp());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -73,6 +73,12 @@ public class TestRegexpQuery extends LuceneTestCase {
|
||||||
return searcher.count(query);
|
return searcher.count(query);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long caseInsensitiveRegexQueryNrHits(String regex) throws IOException {
|
||||||
|
RegexpQuery query = new RegexpQuery(newTerm(regex), RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE,
|
||||||
|
Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||||
|
return searcher.count(query);
|
||||||
|
}
|
||||||
|
|
||||||
public void testRegex1() throws IOException {
|
public void testRegex1() throws IOException {
|
||||||
assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
|
assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
|
||||||
}
|
}
|
||||||
|
@ -125,6 +131,11 @@ public class TestRegexpQuery extends LuceneTestCase {
|
||||||
assertTrue(expected.getMessage().contains("invalid character class"));
|
assertTrue(expected.getMessage().contains("invalid character class"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCaseInsensitive() throws IOException {
|
||||||
|
assertEquals(0, regexQueryNrHits("Quick"));
|
||||||
|
assertEquals(1, caseInsensitiveRegexQueryNrHits("Quick"));
|
||||||
|
}
|
||||||
|
|
||||||
public void testRegexComplement() throws IOException {
|
public void testRegexComplement() throws IOException {
|
||||||
assertEquals(1, regexQueryNrHits("4934~[3]"));
|
assertEquals(1, regexQueryNrHits("4934~[3]"));
|
||||||
// not the empty lang, i.e. match all docs
|
// not the empty lang, i.e. match all docs
|
||||||
|
|
|
@ -88,10 +88,14 @@ public class TestRegExp extends LuceneTestCase {
|
||||||
assertTrue(a.toString().length() > 0);
|
assertTrue(a.toString().length() > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
boolean caseSensitiveQuery = true;
|
||||||
|
|
||||||
public void testCoreJavaParity() {
|
public void testCoreJavaParity() {
|
||||||
// Generate random doc values and random regular expressions
|
// Generate random doc values and random regular expressions
|
||||||
// and check for same matching behaviour as Java's Pattern class.
|
// and check for same matching behaviour as Java's Pattern class.
|
||||||
for (int i = 0; i < 1000; i++) {
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
caseSensitiveQuery = true;
|
||||||
checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
|
checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -144,7 +148,7 @@ public class TestRegExp extends LuceneTestCase {
|
||||||
|
|
||||||
// Modify the middle...
|
// Modify the middle...
|
||||||
String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
|
String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
|
||||||
int mutation = random().nextInt(13);
|
int mutation = random().nextInt(15);
|
||||||
switch (mutation) {
|
switch (mutation) {
|
||||||
case 0:
|
case 0:
|
||||||
// OR with random alpha of same length
|
// OR with random alpha of same length
|
||||||
|
@ -205,6 +209,25 @@ public class TestRegExp extends LuceneTestCase {
|
||||||
// Replace any whitespace chars with the whitespace class
|
// Replace any whitespace chars with the whitespace class
|
||||||
result.append(replacementPart.replaceAll("\\s", "\\\\s"));
|
result.append(replacementPart.replaceAll("\\s", "\\\\s"));
|
||||||
break;
|
break;
|
||||||
|
case 14:
|
||||||
|
// Switch case of characters
|
||||||
|
StringBuilder switchedCase = new StringBuilder();
|
||||||
|
replacementPart.codePoints().forEach(
|
||||||
|
p -> {
|
||||||
|
int switchedP = p;
|
||||||
|
if (Character.isLowerCase(p)) {
|
||||||
|
switchedP = Character.toUpperCase(p);
|
||||||
|
} else {
|
||||||
|
switchedP = Character.toLowerCase(p);
|
||||||
|
}
|
||||||
|
switchedCase.appendCodePoint(switchedP);
|
||||||
|
if (p != switchedP) {
|
||||||
|
caseSensitiveQuery = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
result.append(switchedCase.toString());
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -215,11 +238,14 @@ public class TestRegExp extends LuceneTestCase {
|
||||||
|
|
||||||
String regexPattern = result.toString();
|
String regexPattern = result.toString();
|
||||||
// Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
|
// Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
|
||||||
Pattern pattern = Pattern.compile(regexPattern);
|
Pattern pattern = caseSensitiveQuery ? Pattern.compile(regexPattern):
|
||||||
|
Pattern.compile(regexPattern, Pattern.CASE_INSENSITIVE);
|
||||||
|
;
|
||||||
Matcher matcher = pattern.matcher(docValue);
|
Matcher matcher = pattern.matcher(docValue);
|
||||||
assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
|
assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
|
||||||
|
|
||||||
RegExp regex = new RegExp(regexPattern);
|
int matchFlags = caseSensitiveQuery ? 0 : RegExp.ASCII_CASE_INSENSITIVE;
|
||||||
|
RegExp regex = new RegExp(regexPattern, RegExp.ALL, matchFlags);
|
||||||
Automaton automaton = regex.toAutomaton();
|
Automaton automaton = regex.toAutomaton();
|
||||||
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
|
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
|
||||||
BytesRef br = new BytesRef(docValue);
|
BytesRef br = new BytesRef(docValue);
|
||||||
|
@ -228,6 +254,16 @@ public class TestRegExp extends LuceneTestCase {
|
||||||
+ docValue.length(),
|
+ docValue.length(),
|
||||||
bytesMatcher.run(br.bytes, br.offset, br.length)
|
bytesMatcher.run(br.bytes, br.offset, br.length)
|
||||||
);
|
);
|
||||||
|
if (caseSensitiveQuery == false) {
|
||||||
|
RegExp caseSensitiveRegex = new RegExp(regexPattern);
|
||||||
|
Automaton csAutomaton = caseSensitiveRegex.toAutomaton();
|
||||||
|
ByteRunAutomaton csBytesMatcher = new ByteRunAutomaton(csAutomaton);
|
||||||
|
assertFalse(
|
||||||
|
"[" + regexPattern + "] with case sensitive setting should not match [" + docValue + "]",
|
||||||
|
csBytesMatcher.run(br.bytes, br.offset, br.length)
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
return regexPattern;
|
return regexPattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue