Add ASCII case-insensitive matching to RegExp and RegexpQuery.

RegExp and RegexpQuery gain constructors that take a separate match_flags
argument next to the existing syntax flags. Syntax flags stay within 0xff
(ALL shrinks from 0xffff to 0xff); match flags use the bits above that,
starting with ASCII_CASE_INSENSITIVE = 0x0100. Every parsed RegExp node now
carries the flags, so toAutomaton can expand REGEXP_CHAR and REGEXP_STRING
nodes into both letter cases when ASCII_CASE_INSENSITIVE is set.

diff --git a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
index b3c8ad2e2d5..b241ac5f873 100644
--- a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
@@ -83,7 +83,7 @@ public class RegexpQuery extends AutomatonQuery {
    * Constructs a query for terms matching term.
    *
    * @param term regular expression.
-   * @param flags optional RegExp features from {@link RegExp}
+   * @param flags optional RegExp syntax features from {@link RegExp}
    * @param maxDeterminizedStates maximum number of states that compiling the
    *  automaton for the regexp can result in. Set higher to allow more complex
    *  queries and lower to prevent memory exhaustion.
@@ -96,16 +96,46 @@ public class RegexpQuery extends AutomatonQuery {
    * Constructs a query for terms matching term.
    *
    * @param term regular expression.
-   * @param flags optional RegExp features from {@link RegExp}
+   * @param syntax_flags optional RegExp syntax features from {@link RegExp}
+   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+   * @param maxDeterminizedStates maximum number of states that compiling the
+   *  automaton for the regexp can result in. Set higher to allow more complex
+   *  queries and lower to prevent memory exhaustion.
+   */
+  public RegexpQuery(Term term, int syntax_flags, int match_flags, int maxDeterminizedStates) {
+    this(term, syntax_flags, match_flags, defaultProvider, maxDeterminizedStates);
+  }
+
+  /**
+   * Constructs a query for terms matching term.
+   *
+   * @param term regular expression.
+   * @param syntax_flags optional RegExp syntax features from {@link RegExp}
    * @param provider custom AutomatonProvider for named automata
    * @param maxDeterminizedStates maximum number of states that compiling the
    *  automaton for the regexp can result in. Set higher to allow more complex
    *  queries and lower to prevent memory exhaustion.
    */
-  public RegexpQuery(Term term, int flags, AutomatonProvider provider,
+  public RegexpQuery(Term term, int syntax_flags, AutomatonProvider provider,
+      int maxDeterminizedStates) {
+    this(term, syntax_flags, 0, provider, maxDeterminizedStates);
+  }
+
+  /**
+   * Constructs a query for terms matching term.
+   *
+   * @param term regular expression.
+   * @param syntax_flags optional RegExp syntax features from {@link RegExp}
+   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+   * @param provider custom AutomatonProvider for named automata
+   * @param maxDeterminizedStates maximum number of states that compiling the
+   *  automaton for the regexp can result in. Set higher to allow more complex
+   *  queries and lower to prevent memory exhaustion.
+   */
+  public RegexpQuery(Term term, int syntax_flags, int match_flags, AutomatonProvider provider,
       int maxDeterminizedStates) {
     super(term,
-          new RegExp(term.text(), flags).toAutomaton(
+          new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(
                        provider, maxDeterminizedStates),
           maxDeterminizedStates);
   }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index 0874cde7101..59cf2c41e00 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -32,6 +32,7 @@ package org.apache.lucene.util.automaton;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -405,6 +406,7 @@ public class RegExp {
     REGEXP_PRE_CLASS
   }
 
+  //----- Syntax flags ( <= 0xff ) ------
   /**
    * Syntax flag, enables intersection (&).
    */
@@ -439,12 +441,19 @@ public class RegExp {
   /**
    * Syntax flag, enables all optional regexp syntax.
    */
-  public static final int ALL = 0xffff;
-
+  public static final int ALL = 0xff;
+
   /**
    * Syntax flag, enables no optional regexp syntax.
    */
   public static final int NONE = 0x0000;
+
+  //----- Matching flags ( > 0xff ) ------
+
+  /**
+   * Allows case insensitive matching of ASCII characters.
+   */
+  public static final int ASCII_CASE_INSENSITIVE = 0x0100;
 
   //Immutable parsed state
   /**
@@ -474,7 +483,7 @@ public class RegExp {
 
   // Parser variables
   private final String originalString;
-  int flags;
+  final int flags;
   int pos;
 
   /**
@@ -499,10 +508,30 @@ public class RegExp {
    *              regular expression
    */
   public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
+    this(s, syntax_flags, 0);
+  }
+  /**
+   * Constructs new RegExp from a string.
+   *
+   * @param s regexp string
+   * @param syntax_flags boolean 'or' of optional syntax constructs to be
+   *          enabled
+   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
+   * @exception IllegalArgumentException if an error occurred while parsing the
+   *              regular expression
+   */
+  public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumentException {
+    if (syntax_flags > ALL) {
+      throw new IllegalArgumentException("Illegal syntax flag");
+    }
+
+    if (match_flags > 0 && match_flags <= ALL) {
+      throw new IllegalArgumentException("Illegal match flag");
+    }
+    flags = syntax_flags | match_flags;
     originalString = s;
-    flags = syntax_flags;
     RegExp e;
-    if (s.length() == 0) e = makeString("");
+    if (s.length() == 0) e = makeString(flags, "");
     else {
       e = parseUnionExp();
       if (pos < originalString.length()) throw new IllegalArgumentException(
@@ -520,10 +549,10 @@ public class RegExp {
     to = e.to;
   }
 
-  RegExp(Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){
+  RegExp(int flags, Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){
     this.originalString = null;
     this.kind = kind;
-    this.flags = 0;
+    this.flags = flags;
     this.exp1 = exp1;
     this.exp2 = exp2;
     this.s = s;
@@ -536,19 +565,19 @@ public class RegExp {
   }
 
   // Simplified construction of container nodes
-  static RegExp newContainerNode(Kind kind, RegExp exp1, RegExp exp2) {
-    return new RegExp(kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
+  static RegExp newContainerNode(int flags, Kind kind, RegExp exp1, RegExp exp2) {
+    return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
   }
 
   // Simplified construction of repeating nodes
-  static RegExp newRepeatingNode(Kind kind, RegExp exp, int min, int max) {
-    return new RegExp(kind, exp, null, null, 0, min, max, 0, 0, 0);
+  static RegExp newRepeatingNode(int flags, Kind kind, RegExp exp, int min, int max) {
+    return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, 0, 0);
   }
 
 
   // Simplified construction of leaf nodes
-  static RegExp newLeafNode(Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
-    return new RegExp(kind, null, null, s, c, min, max, digits, from, to);
+  static RegExp newLeafNode(int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
+    return new RegExp(flags, kind, null, null, s, c, min, max, digits, from, to);
   }
 
   /**
@@ -703,7 +732,11 @@ public class RegExp {
         a = MinimizationOperations.minimize(a, maxDeterminizedStates);
         break;
       case REGEXP_CHAR:
-        a = Automata.makeChar(c);
+        if (check(ASCII_CASE_INSENSITIVE)) {
+          a = toCaseInsensitiveChar(c, maxDeterminizedStates);
+        } else {
+          a = Automata.makeChar(c);
+        }
         break;
       case REGEXP_CHAR_RANGE:
         a = Automata.makeCharRange(from, to);
@@ -715,7 +748,11 @@ public class RegExp {
         a = Automata.makeEmpty();
         break;
       case REGEXP_STRING:
-        a = Automata.makeString(s);
+        if (check(ASCII_CASE_INSENSITIVE)) {
+          a = toCaseInsensitiveString(maxDeterminizedStates);
+        } else {
+          a = Automata.makeString(s);
+        }
         break;
       case REGEXP_ANYSTRING:
         a = Automata.makeAnyString();
@@ -743,6 +780,34 @@ public class RegExp {
     }
     return a;
   }
+  private Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
+    Automaton case1 = Automata.makeChar(codepoint);
+    // For now we only work with ASCII characters
+    if (codepoint > 127) {
+      return case1;
+    }
+    int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
+    Automaton result;
+    if (altCase != codepoint) {
+      result = Operations.union(case1, Automata.makeChar(altCase));
+      result = MinimizationOperations.minimize(result, maxDeterminizedStates);
+    } else {
+      result = case1;
+    }
+    return result;
+  }
+
+  private Automaton toCaseInsensitiveString(int maxDeterminizedStates) {
+    List<Automaton> list = new ArrayList<>();
+
+    Iterator<Integer> iter = s.codePoints().iterator();
+    while (iter.hasNext()) {
+      list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
+    }
+    Automaton a = Operations.concatenate(list);
+    a = MinimizationOperations.minimize(a, maxDeterminizedStates);
+    return a;
+  }
 
   private void findLeaves(RegExp exp, Kind kind, List list,
       Map automata, AutomatonProvider automaton_provider,
@@ -1000,97 +1065,97 @@ public class RegExp {
     }
   }
 
-  static RegExp makeUnion(RegExp exp1, RegExp exp2) {
-    return newContainerNode(Kind.REGEXP_UNION, exp1, exp2);
+  static RegExp makeUnion(int flags, RegExp exp1, RegExp exp2) {
+    return newContainerNode(flags, Kind.REGEXP_UNION, exp1, exp2);
   }
 
-  static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
+  static RegExp makeConcatenation(int flags, RegExp exp1, RegExp exp2) {
     if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
         && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
-        exp1, exp2);
+        flags, exp1, exp2);
     RegExp rexp1, rexp2;
     if (exp1.kind == Kind.REGEXP_CONCATENATION
         && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
         && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
       rexp1 = exp1.exp1;
-      rexp2 = makeString(exp1.exp2, exp2);
+      rexp2 = makeString(flags, exp1.exp2, exp2);
     } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
         && exp2.kind == Kind.REGEXP_CONCATENATION
         && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
-      rexp1 = makeString(exp1, exp2.exp1);
+      rexp1 = makeString(flags, exp1, exp2.exp1);
       rexp2 = exp2.exp2;
     } else {
       rexp1 = exp1;
       rexp2 = exp2;
     }
-    return newContainerNode(Kind.REGEXP_CONCATENATION, rexp1, rexp2);
+    return newContainerNode(flags, Kind.REGEXP_CONCATENATION, rexp1, rexp2);
   }
 
-  static private RegExp makeString(RegExp exp1, RegExp exp2) {
+  static private RegExp makeString(int flags, RegExp exp1, RegExp exp2) {
     StringBuilder b = new StringBuilder();
     if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
     else b.appendCodePoint(exp1.c);
     if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
     else b.appendCodePoint(exp2.c);
-    return makeString(b.toString());
+    return makeString(flags, b.toString());
   }
 
-  static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
-    return newContainerNode(Kind.REGEXP_INTERSECTION, exp1, exp2);
+  static RegExp makeIntersection(int flags, RegExp exp1, RegExp exp2) {
+    return newContainerNode(flags, Kind.REGEXP_INTERSECTION, exp1, exp2);
   }
 
-  static RegExp makeOptional(RegExp exp) {
-    return newContainerNode(Kind.REGEXP_OPTIONAL, exp, null);
+  static RegExp makeOptional(int flags, RegExp exp) {
+    return newContainerNode(flags, Kind.REGEXP_OPTIONAL, exp, null);
   }
 
-  static RegExp makeRepeat(RegExp exp) {
-    return newContainerNode(Kind.REGEXP_REPEAT, exp, null);
+  static RegExp makeRepeat(int flags, RegExp exp) {
+    return newContainerNode(flags, Kind.REGEXP_REPEAT, exp, null);
   }
 
-  static RegExp makeRepeat(RegExp exp, int min) {
-    return newRepeatingNode(Kind.REGEXP_REPEAT_MIN, exp, min, 0);
+  static RegExp makeRepeat(int flags, RegExp exp, int min) {
+    return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MIN, exp, min, 0);
   }
 
-  static RegExp makeRepeat(RegExp exp, int min, int max) {
-    return newRepeatingNode(Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
+  static RegExp makeRepeat(int flags, RegExp exp, int min, int max) {
+    return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
   }
 
-  static RegExp makeComplement(RegExp exp) {
-    return newContainerNode(Kind.REGEXP_COMPLEMENT, exp, null);
+  static RegExp makeComplement(int flags, RegExp exp) {
+    return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null);
   }
 
-  static RegExp makeChar(int c) {
-    return newLeafNode(Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
+  static RegExp makeChar(int flags, int c) {
+    return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
   }
 
-  static RegExp makeCharRange(int from, int to) {
+  static RegExp makeCharRange(int flags, int from, int to) {
     if (from > to)
       throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
-    return newLeafNode(Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
+    return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
   }
 
-  static RegExp makeAnyChar() {
-    return newContainerNode(Kind.REGEXP_ANYCHAR, null, null);
+  static RegExp makeAnyChar(int flags) {
+    return newContainerNode(flags, Kind.REGEXP_ANYCHAR, null, null);
   }
 
-  static RegExp makeEmpty() {
-    return newContainerNode(Kind.REGEXP_EMPTY, null, null);
+  static RegExp makeEmpty(int flags) {
+    return newContainerNode(flags, Kind.REGEXP_EMPTY, null, null);
   }
 
-  static RegExp makeString(String s) {
-    return newLeafNode(Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
+  static RegExp makeString(int flags, String s) {
+    return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
   }
 
-  static RegExp makeAnyString() {
-    return newContainerNode(Kind.REGEXP_ANYSTRING, null, null);
+  static RegExp makeAnyString(int flags) {
+    return newContainerNode(flags, Kind.REGEXP_ANYSTRING, null, null);
   }
 
-  static RegExp makeAutomaton(String s) {
-    return newLeafNode(Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
+  static RegExp makeAutomaton(int flags, String s) {
+    return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
   }
 
-  static RegExp makeInterval(int min, int max, int digits) {
-    return newLeafNode(Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
+  static RegExp makeInterval(int flags, int min, int max, int digits) {
+    return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
   }
 
   private boolean peek(String s) {
@@ -1123,13 +1188,13 @@ public class RegExp {
 
   final RegExp parseUnionExp() throws IllegalArgumentException {
     RegExp e = parseInterExp();
-    if (match('|')) e = makeUnion(e, parseUnionExp());
+    if (match('|')) e = makeUnion(flags, e, parseUnionExp());
     return e;
   }
 
   final RegExp parseInterExp() throws IllegalArgumentException {
     RegExp e = parseConcatExp();
-    if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
+    if (check(INTERSECTION) && match('&')) e = makeIntersection(flags, e,
         parseInterExp());
     return e;
   }
@@ -1137,16 +1202,16 @@ public class RegExp {
   final RegExp parseConcatExp() throws IllegalArgumentException {
     RegExp e = parseRepeatExp();
     if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
-        e, parseConcatExp());
+        flags, e, parseConcatExp());
     return e;
   }
 
   final RegExp parseRepeatExp() throws IllegalArgumentException {
     RegExp e = parseComplExp();
     while (peek("?*+{")) {
-      if (match('?')) e = makeOptional(e);
-      else if (match('*')) e = makeRepeat(e);
-      else if (match('+')) e = makeRepeat(e, 1);
+      if (match('?')) e = makeOptional(flags, e);
+      else if (match('*')) e = makeRepeat(flags, e);
+      else if (match('+')) e = makeRepeat(flags, e, 1);
       else if (match('{')) {
         int start = pos;
         while (peek("0123456789"))
@@ -1164,15 +1229,15 @@ public class RegExp {
         } else m = n;
         if (!match('}')) throw new IllegalArgumentException(
             "expected '}' at position " + pos);
-        if (m == -1) e = makeRepeat(e, n);
-        else e = makeRepeat(e, n, m);
+        if (m == -1) e = makeRepeat(flags, e, n);
+        else e = makeRepeat(flags, e, n, m);
       }
     }
     return e;
   }
 
   final RegExp parseComplExp() throws IllegalArgumentException {
-    if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
+    if (check(COMPLEMENT) && match('~')) return makeComplement(flags, parseComplExp());
     else return parseCharClassExp();
   }
 
@@ -1181,7 +1246,7 @@ public class RegExp {
       boolean negate = false;
       if (match('^')) negate = true;
       RegExp e = parseCharClasses();
-      if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e));
+      if (negate) e = makeIntersection(flags, makeAnyChar(flags), makeComplement(flags, e));
       if (!match(']')) throw new IllegalArgumentException(
           "expected ']' at position " + pos);
       return e;
@@ -1191,7 +1256,7 @@ public class RegExp {
   final RegExp parseCharClasses() throws IllegalArgumentException {
     RegExp e = parseCharClass();
     while (more() && !peek("]"))
-      e = makeUnion(e, parseCharClass());
+      e = makeUnion(flags, e, parseCharClass());
     return e;
   }
 
@@ -1202,8 +1267,8 @@ public class RegExp {
     }
 
     int c = parseCharExp();
-    if (match('-')) return makeCharRange(c, parseCharExp());
-    else return makeChar(c);
+    if (match('-')) return makeCharRange(flags, c, parseCharExp());
+    else return makeChar(flags, c);
   }
 
   RegExp expandPredefined() {
@@ -1232,11 +1297,11 @@ public class RegExp {
     //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
     if (match('\\')) {
       if (peek("dDwWsS")) {
-        return newLeafNode(Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
+        return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
       }
 
       if (peek("\\")) {
-        return makeChar(next());
+        return makeChar(flags, next());
       }
 
       // From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
@@ -1252,18 +1317,18 @@ public class RegExp {
 
 
   final RegExp parseSimpleExp() throws IllegalArgumentException {
-    if (match('.')) return makeAnyChar();
-    else if (check(EMPTY) && match('#')) return makeEmpty();
-    else if (check(ANYSTRING) && match('@')) return makeAnyString();
+    if (match('.')) return makeAnyChar(flags);
+    else if (check(EMPTY) && match('#')) return makeEmpty(flags);
+    else if (check(ANYSTRING) && match('@')) return makeAnyString(flags);
     else if (match('"')) {
       int start = pos;
       while (more() && !peek("\""))
         next();
       if (!match('"')) throw new IllegalArgumentException(
           "expected '\"' at position " + pos);
-      return makeString(originalString.substring(start, pos - 1));
+      return makeString(flags, originalString.substring(start, pos - 1));
     } else if (match('(')) {
-      if (match(')')) return makeString("");
+      if (match(')')) return makeString(flags, "");
       RegExp e = parseUnionExp();
       if (!match(')')) throw new IllegalArgumentException(
           "expected ')' at position " + pos);
@@ -1279,7 +1344,7 @@ public class RegExp {
       if (i == -1) {
         if (!check(AUTOMATON)) throw new IllegalArgumentException(
             "interval syntax error at position " + (pos - 1));
-        return makeAutomaton(s);
+        return makeAutomaton(flags, s);
       } else {
         if (!check(INTERVAL)) throw new IllegalArgumentException(
             "illegal identifier at position " + (pos - 1));
@@ -1297,7 +1362,7 @@ public class RegExp {
             imin = imax;
             imax = t;
           }
-          return makeInterval(imin, imax, digits);
+          return makeInterval(flags, imin, imax, digits);
         } catch (NumberFormatException e) {
           throw new IllegalArgumentException(
               "interval syntax error at position " + (pos - 1));
@@ -1308,7 +1373,7 @@ public class RegExp {
       if (predefined != null) {
         return predefined;
       }
-      return makeChar(parseCharExp());
+      return makeChar(flags, parseCharExp());
     }
   }
 
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
index 767ee20a7d4..1db75ac4fe6 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@@ -73,6 +73,12 @@ public class TestRegexpQuery extends LuceneTestCase {
     return searcher.count(query);
   }
 
+  private long caseInsensitiveRegexQueryNrHits(String regex) throws IOException {
+    RegexpQuery query = new RegexpQuery(newTerm(regex), RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE,
+        Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+    return searcher.count(query);
+  }
+
   public void testRegex1() throws IOException {
     assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
   }
@@ -125,6 +131,11 @@ public class TestRegexpQuery extends LuceneTestCase {
     assertTrue(expected.getMessage().contains("invalid character class"));
   }
 
+  public void testCaseInsensitive() throws IOException {
+    assertEquals(0, regexQueryNrHits("Quick"));
+    assertEquals(1, caseInsensitiveRegexQueryNrHits("Quick"));
+  }
+
   public void testRegexComplement() throws IOException {
     assertEquals(1, regexQueryNrHits("4934~[3]"));
     // not the empty lang, i.e. match all docs
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
index ce36eacfb10..26a76fb9a18 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@@ -88,10 +88,14 @@ public class TestRegExp extends LuceneTestCase {
     assertTrue(a.toString().length() > 0);
   }
 
+
+  boolean caseSensitiveQuery = true;
+
   public void testCoreJavaParity() {
     // Generate random doc values and random regular expressions
     // and check for same matching behaviour as Java's Pattern class.
     for (int i = 0; i < 1000; i++) {
+      caseSensitiveQuery = true;
       checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
     }
   }
@@ -144,7 +148,7 @@ public class TestRegExp extends LuceneTestCase {
       // Modify the middle...
       String replacementPart = docValue.substring(substitutionPoint,
           substitutionPoint + substitutionLength);
-      int mutation = random().nextInt(13);
+      int mutation = random().nextInt(15);
       switch (mutation) {
         case 0:
           // OR with random alpha of same length
@@ -205,6 +209,25 @@ public class TestRegExp extends LuceneTestCase {
           // Make any whitespace chars replace by whitespace class
           result.append(replacementPart.replaceAll("\\s", "\\\\s"));
           break;
+        case 14:
+          // Switch case of characters
+          StringBuilder switchedCase = new StringBuilder();
+          replacementPart.codePoints().forEach(
+              p -> {
+                int switchedP = p;
+                if (Character.isLowerCase(p)) {
+                  switchedP = Character.toUpperCase(p);
+                } else {
+                  switchedP = Character.toLowerCase(p);
+                }
+                switchedCase.appendCodePoint(switchedP);
+                if (p != switchedP) {
+                  caseSensitiveQuery = false;
+                }
+              }
+          );
+          result.append(switchedCase.toString());
+          break;
         default:
           break;
       }
@@ -215,11 +238,14 @@ public class TestRegExp extends LuceneTestCase {
     String regexPattern = result.toString();
 
     // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
-    Pattern pattern = Pattern.compile(regexPattern);
+    Pattern pattern = caseSensitiveQuery
+        ? Pattern.compile(regexPattern)
+        : Pattern.compile(regexPattern, Pattern.CASE_INSENSITIVE);
     Matcher matcher = pattern.matcher(docValue);
     assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
 
-    RegExp regex = new RegExp(regexPattern);
+    int matchFlags = caseSensitiveQuery ? 0 : RegExp.ASCII_CASE_INSENSITIVE;
+    RegExp regex = new RegExp(regexPattern, RegExp.ALL, matchFlags);
     Automaton automaton = regex.toAutomaton();
     ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
     BytesRef br = new BytesRef(docValue);
@@ -228,6 +254,16 @@ public class TestRegExp extends LuceneTestCase {
         + docValue.length(),
         bytesMatcher.run(br.bytes, br.offset, br.length)
     );
+    if (caseSensitiveQuery == false) {
+      RegExp caseSensitiveRegex = new RegExp(regexPattern);
+      Automaton csAutomaton = caseSensitiveRegex.toAutomaton();
+      ByteRunAutomaton csBytesMatcher = new ByteRunAutomaton(csAutomaton);
+      assertFalse(
+          "[" + regexPattern + "] with case sensitive setting should not match [" + docValue + "]",
+          csBytesMatcher.run(br.bytes, br.offset, br.length)
+      );
+
+    }
     return regexPattern;
   }
 