diff --git a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
index b3c8ad2e2d5..b241ac5f873 100644
--- a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
@@ -83,7 +83,7 @@ public class RegexpQuery extends AutomatonQuery {
* Constructs a query for terms matching <code>term</code>.
*
* @param term regular expression.
- * @param flags optional RegExp features from {@link RegExp}
+ * @param flags optional RegExp syntax features from {@link RegExp}
* @param maxDeterminizedStates maximum number of states that compiling the
* automaton for the regexp can result in. Set higher to allow more complex
* queries and lower to prevent memory exhaustion.
@@ -96,16 +96,46 @@ public class RegexpQuery extends AutomatonQuery {
* Constructs a query for terms matching <code>term</code>.
*
* @param term regular expression.
- * @param flags optional RegExp features from {@link RegExp}
+ * @param syntax_flags optional RegExp syntax features from {@link RegExp}
+ * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+ * @param maxDeterminizedStates maximum number of states that compiling the
+ * automaton for the regexp can result in. Set higher to allow more complex
+ * queries and lower to prevent memory exhaustion.
+ */
+ public RegexpQuery(Term term, int syntax_flags, int match_flags, int maxDeterminizedStates) {
+ this(term, syntax_flags, match_flags, defaultProvider, maxDeterminizedStates);
+ }
+
+ /**
+ * Constructs a query for terms matching <code>term</code>.
+ *
+ * @param term regular expression.
+ * @param syntax_flags optional RegExp syntax features from {@link RegExp}
* @param provider custom AutomatonProvider for named automata
* @param maxDeterminizedStates maximum number of states that compiling the
* automaton for the regexp can result in. Set higher to allow more complex
* queries and lower to prevent memory exhaustion.
*/
- public RegexpQuery(Term term, int flags, AutomatonProvider provider,
+ public RegexpQuery(Term term, int syntax_flags, AutomatonProvider provider,
+ int maxDeterminizedStates) {
+ this(term, syntax_flags, 0, provider, maxDeterminizedStates);
+ }
+
+ /**
+ * Constructs a query for terms matching <code>term</code>.
+ *
+ * @param term regular expression.
+ * @param syntax_flags optional RegExp syntax features from {@link RegExp}
+ * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+ * @param provider custom AutomatonProvider for named automata
+ * @param maxDeterminizedStates maximum number of states that compiling the
+ * automaton for the regexp can result in. Set higher to allow more complex
+ * queries and lower to prevent memory exhaustion.
+ */
+ public RegexpQuery(Term term, int syntax_flags, int match_flags, AutomatonProvider provider,
int maxDeterminizedStates) {
super(term,
- new RegExp(term.text(), flags).toAutomaton(
+ new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(
provider, maxDeterminizedStates), maxDeterminizedStates);
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index 0874cde7101..59cf2c41e00 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -32,6 +32,7 @@ package org.apache.lucene.util.automaton;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -405,6 +406,7 @@ public class RegExp {
REGEXP_PRE_CLASS
}
+ //----- Syntax flags ( <= 0xff ) ------
/**
* Syntax flag, enables intersection (<code>&amp;</code>).
*/
@@ -439,12 +441,19 @@ public class RegExp {
/**
* Syntax flag, enables all optional regexp syntax.
*/
- public static final int ALL = 0xffff;
-
+ public static final int ALL = 0xff;
+
/**
* Syntax flag, enables no optional regexp syntax.
*/
public static final int NONE = 0x0000;
+
+ //----- Matching flags ( > 0xff ) ------
+
+ /**
+ * Allows case insensitive matching of ASCII characters.
+ */
+ public static final int ASCII_CASE_INSENSITIVE = 0x0100;
//Immutable parsed state
/**
@@ -474,7 +483,7 @@ public class RegExp {
// Parser variables
private final String originalString;
- int flags;
+ final int flags;
int pos;
/**
@@ -499,10 +508,30 @@ public class RegExp {
* regular expression
*/
public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
+ this(s, syntax_flags, 0);
+ }
+ /**
+ * Constructs new <code>RegExp</code> from a string.
+ *
+ * @param s regexp string
+ * @param syntax_flags boolean 'or' of optional syntax constructs to be
+ * enabled
+ * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+ * @exception IllegalArgumentException if an error occurred while parsing the
+ * regular expression
+ */
+ public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumentException {
+ if (syntax_flags > ALL) {
+ throw new IllegalArgumentException("Illegal syntax flag");
+ }
+
+ if (match_flags > 0 && match_flags <= ALL) {
+ throw new IllegalArgumentException("Illegal match flag");
+ }
+ flags = syntax_flags | match_flags;
originalString = s;
- flags = syntax_flags;
RegExp e;
- if (s.length() == 0) e = makeString("");
+ if (s.length() == 0) e = makeString(flags, "");
else {
e = parseUnionExp();
if (pos < originalString.length()) throw new IllegalArgumentException(
@@ -520,10 +549,10 @@ public class RegExp {
to = e.to;
}
- RegExp(Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){
+ RegExp(int flags, Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){
this.originalString = null;
this.kind = kind;
- this.flags = 0;
+ this.flags = flags;
this.exp1 = exp1;
this.exp2 = exp2;
this.s = s;
@@ -536,19 +565,19 @@ public class RegExp {
}
// Simplified construction of container nodes
- static RegExp newContainerNode(Kind kind, RegExp exp1, RegExp exp2) {
- return new RegExp(kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
+ static RegExp newContainerNode(int flags, Kind kind, RegExp exp1, RegExp exp2) {
+ return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
}
// Simplified construction of repeating nodes
- static RegExp newRepeatingNode(Kind kind, RegExp exp, int min, int max) {
- return new RegExp(kind, exp, null, null, 0, min, max, 0, 0, 0);
+ static RegExp newRepeatingNode(int flags, Kind kind, RegExp exp, int min, int max) {
+ return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, 0, 0);
}
// Simplified construction of leaf nodes
- static RegExp newLeafNode(Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
- return new RegExp(kind, null, null, s, c, min, max, digits, from, to);
+ static RegExp newLeafNode(int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
+ return new RegExp(flags, kind, null, null, s, c, min, max, digits, from, to);
}
/**
@@ -703,7 +732,11 @@ public class RegExp {
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
break;
case REGEXP_CHAR:
- a = Automata.makeChar(c);
+ if (check(ASCII_CASE_INSENSITIVE)) {
+ a = toCaseInsensitiveChar(c, maxDeterminizedStates);
+ } else {
+ a = Automata.makeChar(c);
+ }
break;
case REGEXP_CHAR_RANGE:
a = Automata.makeCharRange(from, to);
@@ -715,7 +748,11 @@ public class RegExp {
a = Automata.makeEmpty();
break;
case REGEXP_STRING:
- a = Automata.makeString(s);
+ if (check(ASCII_CASE_INSENSITIVE)) {
+ a = toCaseInsensitiveString(maxDeterminizedStates);
+ } else {
+ a = Automata.makeString(s);
+ }
break;
case REGEXP_ANYSTRING:
a = Automata.makeAnyString();
@@ -743,6 +780,34 @@ public class RegExp {
}
return a;
}
+  /**
+   * Builds an automaton for a single code point under ASCII case-insensitive matching.
+   * <p>
+   * Code points outside the 7-bit ASCII range (0-127), and ASCII characters that have no
+   * distinct opposite-case form, are matched exactly.
+   *
+   * @param codepoint the code point to match
+   * @param maxDeterminizedStates state limit passed to minimization of the two-character union
+   * @return an automaton accepting {@code codepoint} and, where one exists, its
+   *         opposite-case counterpart
+   */
+  private Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
+    Automaton case1 = Automata.makeChar(codepoint);
+    // For now we only work with ASCII characters, i.e. code points 0..127.
+    if (codepoint > 127) {
+      return case1;
+    }
+    int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
+    if (altCase == codepoint) {
+      // No distinct opposite case (digits, punctuation, ...): match the character as-is.
+      return case1;
+    }
+    Automaton result = Operations.union(case1, Automata.makeChar(altCase));
+    return MinimizationOperations.minimize(result, maxDeterminizedStates);
+  }
+
+  /**
+   * Builds an automaton for this node's string {@code s} under ASCII case-insensitive
+   * matching, by concatenating one per-code-point automaton for every code point of
+   * the string.
+   *
+   * @param maxDeterminizedStates state limit passed to each minimization step
+   * @return the minimized concatenation of the per-character automata
+   */
+  private Automaton toCaseInsensitiveString(int maxDeterminizedStates) {
+    List<Automaton> list = new ArrayList<>();
+
+    // Iterate by code point (not by char) so supplementary characters stay intact.
+    Iterator<Integer> iter = s.codePoints().iterator();
+    while (iter.hasNext()) {
+      list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
+    }
+    Automaton a = Operations.concatenate(list);
+    a = MinimizationOperations.minimize(a, maxDeterminizedStates);
+    return a;
+  }
private void findLeaves(RegExp exp, Kind kind, List list,
Map automata, AutomatonProvider automaton_provider,
@@ -1000,97 +1065,97 @@ public class RegExp {
}
}
- static RegExp makeUnion(RegExp exp1, RegExp exp2) {
- return newContainerNode(Kind.REGEXP_UNION, exp1, exp2);
+ static RegExp makeUnion(int flags, RegExp exp1, RegExp exp2) {
+ return newContainerNode(flags, Kind.REGEXP_UNION, exp1, exp2);
}
- static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
+ static RegExp makeConcatenation(int flags, RegExp exp1, RegExp exp2) {
if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
&& (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
- exp1, exp2);
+ flags, exp1, exp2);
RegExp rexp1, rexp2;
if (exp1.kind == Kind.REGEXP_CONCATENATION
&& (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
&& (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
rexp1 = exp1.exp1;
- rexp2 = makeString(exp1.exp2, exp2);
+ rexp2 = makeString(flags, exp1.exp2, exp2);
} else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
&& exp2.kind == Kind.REGEXP_CONCATENATION
&& (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
- rexp1 = makeString(exp1, exp2.exp1);
+ rexp1 = makeString(flags, exp1, exp2.exp1);
rexp2 = exp2.exp2;
} else {
rexp1 = exp1;
rexp2 = exp2;
}
- return newContainerNode(Kind.REGEXP_CONCATENATION, rexp1, rexp2);
+ return newContainerNode(flags, Kind.REGEXP_CONCATENATION, rexp1, rexp2);
}
- static private RegExp makeString(RegExp exp1, RegExp exp2) {
+ static private RegExp makeString(int flags, RegExp exp1, RegExp exp2) {
StringBuilder b = new StringBuilder();
if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
else b.appendCodePoint(exp1.c);
if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
else b.appendCodePoint(exp2.c);
- return makeString(b.toString());
+ return makeString(flags, b.toString());
}
- static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
- return newContainerNode(Kind.REGEXP_INTERSECTION, exp1, exp2);
+ static RegExp makeIntersection(int flags, RegExp exp1, RegExp exp2) {
+ return newContainerNode(flags, Kind.REGEXP_INTERSECTION, exp1, exp2);
}
- static RegExp makeOptional(RegExp exp) {
- return newContainerNode(Kind.REGEXP_OPTIONAL, exp, null);
+ static RegExp makeOptional(int flags, RegExp exp) {
+ return newContainerNode(flags, Kind.REGEXP_OPTIONAL, exp, null);
}
- static RegExp makeRepeat(RegExp exp) {
- return newContainerNode(Kind.REGEXP_REPEAT, exp, null);
+ static RegExp makeRepeat(int flags, RegExp exp) {
+ return newContainerNode(flags, Kind.REGEXP_REPEAT, exp, null);
}
- static RegExp makeRepeat(RegExp exp, int min) {
- return newRepeatingNode(Kind.REGEXP_REPEAT_MIN, exp, min, 0);
+ static RegExp makeRepeat(int flags, RegExp exp, int min) {
+ return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MIN, exp, min, 0);
}
- static RegExp makeRepeat(RegExp exp, int min, int max) {
- return newRepeatingNode(Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
+ static RegExp makeRepeat(int flags, RegExp exp, int min, int max) {
+ return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
}
- static RegExp makeComplement(RegExp exp) {
- return newContainerNode(Kind.REGEXP_COMPLEMENT, exp, null);
+ static RegExp makeComplement(int flags, RegExp exp) {
+ return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null);
}
- static RegExp makeChar(int c) {
- return newLeafNode(Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
+ static RegExp makeChar(int flags, int c) {
+ return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
}
- static RegExp makeCharRange(int from, int to) {
+ static RegExp makeCharRange(int flags, int from, int to) {
if (from > to)
throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
- return newLeafNode(Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
+ return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
}
- static RegExp makeAnyChar() {
- return newContainerNode(Kind.REGEXP_ANYCHAR, null, null);
+ static RegExp makeAnyChar(int flags) {
+ return newContainerNode(flags, Kind.REGEXP_ANYCHAR, null, null);
}
- static RegExp makeEmpty() {
- return newContainerNode(Kind.REGEXP_EMPTY, null, null);
+ static RegExp makeEmpty(int flags) {
+ return newContainerNode(flags, Kind.REGEXP_EMPTY, null, null);
}
- static RegExp makeString(String s) {
- return newLeafNode(Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
+ static RegExp makeString(int flags, String s) {
+ return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
}
- static RegExp makeAnyString() {
- return newContainerNode(Kind.REGEXP_ANYSTRING, null, null);
+ static RegExp makeAnyString(int flags) {
+ return newContainerNode(flags, Kind.REGEXP_ANYSTRING, null, null);
}
- static RegExp makeAutomaton(String s) {
- return newLeafNode(Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
+ static RegExp makeAutomaton(int flags, String s) {
+ return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
}
- static RegExp makeInterval(int min, int max, int digits) {
- return newLeafNode(Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
+ static RegExp makeInterval(int flags, int min, int max, int digits) {
+ return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
}
private boolean peek(String s) {
@@ -1123,13 +1188,13 @@ public class RegExp {
final RegExp parseUnionExp() throws IllegalArgumentException {
RegExp e = parseInterExp();
- if (match('|')) e = makeUnion(e, parseUnionExp());
+ if (match('|')) e = makeUnion(flags, e, parseUnionExp());
return e;
}
final RegExp parseInterExp() throws IllegalArgumentException {
RegExp e = parseConcatExp();
- if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
+ if (check(INTERSECTION) && match('&')) e = makeIntersection(flags, e,
parseInterExp());
return e;
}
@@ -1137,16 +1202,16 @@ public class RegExp {
final RegExp parseConcatExp() throws IllegalArgumentException {
RegExp e = parseRepeatExp();
if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
- e, parseConcatExp());
+ flags, e, parseConcatExp());
return e;
}
final RegExp parseRepeatExp() throws IllegalArgumentException {
RegExp e = parseComplExp();
while (peek("?*+{")) {
- if (match('?')) e = makeOptional(e);
- else if (match('*')) e = makeRepeat(e);
- else if (match('+')) e = makeRepeat(e, 1);
+ if (match('?')) e = makeOptional(flags, e);
+ else if (match('*')) e = makeRepeat(flags, e);
+ else if (match('+')) e = makeRepeat(flags, e, 1);
else if (match('{')) {
int start = pos;
while (peek("0123456789"))
@@ -1164,15 +1229,15 @@ public class RegExp {
} else m = n;
if (!match('}')) throw new IllegalArgumentException(
"expected '}' at position " + pos);
- if (m == -1) e = makeRepeat(e, n);
- else e = makeRepeat(e, n, m);
+ if (m == -1) e = makeRepeat(flags, e, n);
+ else e = makeRepeat(flags, e, n, m);
}
}
return e;
}
final RegExp parseComplExp() throws IllegalArgumentException {
- if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
+ if (check(COMPLEMENT) && match('~')) return makeComplement(flags, parseComplExp());
else return parseCharClassExp();
}
@@ -1181,7 +1246,7 @@ public class RegExp {
boolean negate = false;
if (match('^')) negate = true;
RegExp e = parseCharClasses();
- if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e));
+ if (negate) e = makeIntersection(flags, makeAnyChar(flags), makeComplement(flags, e));
if (!match(']')) throw new IllegalArgumentException(
"expected ']' at position " + pos);
return e;
@@ -1191,7 +1256,7 @@ public class RegExp {
final RegExp parseCharClasses() throws IllegalArgumentException {
RegExp e = parseCharClass();
while (more() && !peek("]"))
- e = makeUnion(e, parseCharClass());
+ e = makeUnion(flags, e, parseCharClass());
return e;
}
@@ -1202,8 +1267,8 @@ public class RegExp {
}
int c = parseCharExp();
- if (match('-')) return makeCharRange(c, parseCharExp());
- else return makeChar(c);
+ if (match('-')) return makeCharRange(flags, c, parseCharExp());
+ else return makeChar(flags, c);
}
RegExp expandPredefined() {
@@ -1232,11 +1297,11 @@ public class RegExp {
//See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
if (match('\\')) {
if (peek("dDwWsS")) {
- return newLeafNode(Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
+ return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
}
if (peek("\\")) {
- return makeChar(next());
+ return makeChar(flags, next());
}
// From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
@@ -1252,18 +1317,18 @@ public class RegExp {
final RegExp parseSimpleExp() throws IllegalArgumentException {
- if (match('.')) return makeAnyChar();
- else if (check(EMPTY) && match('#')) return makeEmpty();
- else if (check(ANYSTRING) && match('@')) return makeAnyString();
+ if (match('.')) return makeAnyChar(flags);
+ else if (check(EMPTY) && match('#')) return makeEmpty(flags);
+ else if (check(ANYSTRING) && match('@')) return makeAnyString(flags);
else if (match('"')) {
int start = pos;
while (more() && !peek("\""))
next();
if (!match('"')) throw new IllegalArgumentException(
"expected '\"' at position " + pos);
- return makeString(originalString.substring(start, pos - 1));
+ return makeString(flags, originalString.substring(start, pos - 1));
} else if (match('(')) {
- if (match(')')) return makeString("");
+ if (match(')')) return makeString(flags, "");
RegExp e = parseUnionExp();
if (!match(')')) throw new IllegalArgumentException(
"expected ')' at position " + pos);
@@ -1279,7 +1344,7 @@ public class RegExp {
if (i == -1) {
if (!check(AUTOMATON)) throw new IllegalArgumentException(
"interval syntax error at position " + (pos - 1));
- return makeAutomaton(s);
+ return makeAutomaton(flags, s);
} else {
if (!check(INTERVAL)) throw new IllegalArgumentException(
"illegal identifier at position " + (pos - 1));
@@ -1297,7 +1362,7 @@ public class RegExp {
imin = imax;
imax = t;
}
- return makeInterval(imin, imax, digits);
+ return makeInterval(flags, imin, imax, digits);
} catch (NumberFormatException e) {
throw new IllegalArgumentException(
"interval syntax error at position " + (pos - 1));
@@ -1308,7 +1373,7 @@ public class RegExp {
if (predefined != null) {
return predefined;
}
- return makeChar(parseCharExp());
+ return makeChar(flags, parseCharExp());
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
index 767ee20a7d4..1db75ac4fe6 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@@ -73,6 +73,12 @@ public class TestRegexpQuery extends LuceneTestCase {
return searcher.count(query);
}
+  /**
+   * Counts the hits for {@code regex} when it is run with all optional syntax enabled
+   * and ASCII case-insensitive matching.
+   */
+  private long caseInsensitiveRegexQueryNrHits(String regex) throws IOException {
+    final int syntaxFlags = RegExp.ALL;
+    final int matchFlags = RegExp.ASCII_CASE_INSENSITIVE;
+    return searcher.count(
+        new RegexpQuery(newTerm(regex), syntaxFlags, matchFlags, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
+  }
+
public void testRegex1() throws IOException {
assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
}
@@ -125,6 +131,11 @@ public class TestRegexpQuery extends LuceneTestCase {
assertTrue(expected.getMessage().contains("invalid character class"));
}
+ /** Checks that the capitalised pattern "Quick" only produces a hit through the case-insensitive helper. */
+ public void testCaseInsensitive() throws IOException {
+ // Via the existing helper the capitalised pattern is expected to find nothing.
+ assertEquals(0, regexQueryNrHits("Quick"));
+ // Via the ASCII_CASE_INSENSITIVE helper the same pattern is expected to find one document.
+ assertEquals(1, caseInsensitiveRegexQueryNrHits("Quick"));
+ }
+
public void testRegexComplement() throws IOException {
assertEquals(1, regexQueryNrHits("4934~[3]"));
// not the empty lang, i.e. match all docs
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
index ce36eacfb10..26a76fb9a18 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@@ -88,10 +88,14 @@ public class TestRegExp extends LuceneTestCase {
assertTrue(a.toString().length() > 0);
}
+
+ boolean caseSensitiveQuery = true;
+
public void testCoreJavaParity() {
// Generate random doc values and random regular expressions
// and check for same matching behaviour as Java's Pattern class.
for (int i = 0; i < 1000; i++) {
+ caseSensitiveQuery = true;
checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
}
}
@@ -144,7 +148,7 @@ public class TestRegExp extends LuceneTestCase {
// Modify the middle...
String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
- int mutation = random().nextInt(13);
+ int mutation = random().nextInt(15);
switch (mutation) {
case 0:
// OR with random alpha of same length
@@ -205,6 +209,25 @@ public class TestRegExp extends LuceneTestCase {
// Make any whitespace chars replace by whitespace class
result.append(replacementPart.replaceAll("\\s", "\\\\s"));
break;
+ case 14:
+ // Switch case of characters
+ StringBuilder switchedCase = new StringBuilder();
+ replacementPart.codePoints().forEach(
+ p -> {
+ int switchedP = p;
+ if (Character.isLowerCase(p)) {
+ switchedP = Character.toUpperCase(p);
+ } else {
+ switchedP = Character.toLowerCase(p);
+ }
+ switchedCase.appendCodePoint(switchedP);
+ if (p != switchedP) {
+ caseSensitiveQuery = false;
+ }
+ }
+ );
+ result.append(switchedCase.toString());
+ break;
default:
break;
}
@@ -215,11 +238,14 @@ public class TestRegExp extends LuceneTestCase {
String regexPattern = result.toString();
// Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
- Pattern pattern = Pattern.compile(regexPattern);
+ Pattern pattern = caseSensitiveQuery ? Pattern.compile(regexPattern):
+ Pattern.compile(regexPattern, Pattern.CASE_INSENSITIVE);
+ ;
Matcher matcher = pattern.matcher(docValue);
assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
- RegExp regex = new RegExp(regexPattern);
+ int matchFlags = caseSensitiveQuery ? 0 : RegExp.ASCII_CASE_INSENSITIVE;
+ RegExp regex = new RegExp(regexPattern, RegExp.ALL, matchFlags);
Automaton automaton = regex.toAutomaton();
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
BytesRef br = new BytesRef(docValue);
@@ -228,6 +254,16 @@ public class TestRegExp extends LuceneTestCase {
+ docValue.length(),
bytesMatcher.run(br.bytes, br.offset, br.length)
);
+ if (caseSensitiveQuery == false) {
+ RegExp caseSensitiveRegex = new RegExp(regexPattern);
+ Automaton csAutomaton = caseSensitiveRegex.toAutomaton();
+ ByteRunAutomaton csBytesMatcher = new ByteRunAutomaton(csAutomaton);
+ assertFalse(
+ "[" + regexPattern + "] with case sensitive setting should not match [" + docValue + "]",
+ csBytesMatcher.run(br.bytes, br.offset, br.length)
+ );
+
+ }
return regexPattern;
}