LUCENE-9386 add case insensitive RegExp matching option (#1541)

Added case insensitive search option (currently only works with ASCII characters)
2020-07-08 16:08:12 +01:00 · 2020-07-08 16:08:12 +01:00 · 887fe4c83d
parent 00203c292f
commit 887fe4c83d
4 changed files with 224 additions and 82 deletions
--- a/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
@ -83,7 +83,7 @@ public class RegexpQuery extends AutomatonQuery {
   * Constructs a query for terms matching <code>term</code>.
   * 
   * @param term regular expression.
-   * @param flags optional RegExp features from {@link RegExp}
+   * @param flags optional RegExp syntax features from {@link RegExp}
   * @param maxDeterminizedStates maximum number of states that compiling the
   *  automaton for the regexp can result in.  Set higher to allow more complex
   *  queries and lower to prevent memory exhaustion.
@ -96,16 +96,46 @@ public class RegexpQuery extends AutomatonQuery {
   * Constructs a query for terms matching <code>term</code>.
   * 
   * @param term regular expression.
-   * @param flags optional RegExp features from {@link RegExp}
+   * @param syntax_flags optional RegExp syntax features from {@link RegExp}
   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
   * @param maxDeterminizedStates maximum number of states that compiling the
   *  automaton for the regexp can result in.  Set higher to allow more complex
   *  queries and lower to prevent memory exhaustion.
   */
  public RegexpQuery(Term term, int syntax_flags, int match_flags, int maxDeterminizedStates) {
    // Delegate to the five-argument constructor, passing defaultProvider as the AutomatonProvider.
    this(term, syntax_flags, match_flags, defaultProvider, maxDeterminizedStates);
  }
  /**
   * Constructs a query for terms matching <code>term</code>.
   * 
   * @param term regular expression.
   * @param syntax_flags optional RegExp syntax features from {@link RegExp}
   * @param provider custom AutomatonProvider for named automata
   * @param maxDeterminizedStates maximum number of states that compiling the
   *  automaton for the regexp can result in.  Set higher to allow more complex
   *  queries and lower to prevent memory exhaustion.
   */
-  public RegexpQuery(Term term, int flags, AutomatonProvider provider,
+  public RegexpQuery(Term term, int syntax_flags, AutomatonProvider provider,
      int maxDeterminizedStates) {
    this(term, syntax_flags, 0, provider, maxDeterminizedStates);
  }
  /**
   * Constructs a query for terms matching <code>term</code>.
   * 
   * @param term regular expression.
   * @param syntax_flags optional RegExp syntax features from {@link RegExp}
   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
   * @param provider custom AutomatonProvider for named automata
   * @param maxDeterminizedStates maximum number of states that compiling the
   *  automaton for the regexp can result in.  Set higher to allow more complex
   *  queries and lower to prevent memory exhaustion.
   */
  public RegexpQuery(Term term, int syntax_flags, int match_flags, AutomatonProvider provider,
      int maxDeterminizedStates) {
    super(term,
-          new RegExp(term.text(), flags).toAutomaton(
+          new RegExp(term.text(), syntax_flags, match_flags).toAutomaton(
                       provider, maxDeterminizedStates), maxDeterminizedStates);
  }
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@ -32,6 +32,7 @@ package org.apache.lucene.util.automaton;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@ -405,6 +406,7 @@ public class RegExp {
    REGEXP_PRE_CLASS
  }
  //-----  Syntax flags ( <= 0xff )  ------
  /**
   * Syntax flag, enables intersection (<code>&amp;</code>).
   */
@ -439,13 +441,20 @@ public class RegExp {
  /**
   * Syntax flag, enables all optional regexp syntax.
   */
-  public static final int ALL = 0xffff;
+  public static final int ALL = 0xff;
  /**
   * Syntax flag, enables no optional regexp syntax.
   */
  public static final int NONE = 0x0000;
  //-----  Matching flags ( > 0xff )  ------
  /**
   * Allows case insensitive matching of ASCII characters.
   */
  public static final int ASCII_CASE_INSENSITIVE = 0x0100;    
  //Immutable parsed state
  /**
   * The type of expression
@ -474,7 +483,7 @@ public class RegExp {
  // Parser variables
  private final String originalString;
-  int flags;
+  final int flags;
  int pos;
  /**
@ -499,10 +508,30 @@ public class RegExp {
   *              regular expression
   */
  public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
    // Delegate to the three-argument constructor with match_flags = 0 (no match flags set).
    this(s, syntax_flags, 0);
  }
  /**
   * Constructs new <code>RegExp</code> from a string.
   * 
   * @param s regexp string
   * @param syntax_flags boolean 'or' of optional syntax constructs to be
   *          enabled
   * @param match_flags boolean 'or' of match behavior options such as case insensitivity
   * @exception IllegalArgumentException if an error occurred while parsing the
   *              regular expression
   */
  public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumentException {    
    if (syntax_flags >  ALL) {
      throw new IllegalArgumentException("Illegal syntax flag");
    }
    if (match_flags > 0 && match_flags <= ALL) {
      throw new IllegalArgumentException("Illegal match flag");
    }
    flags = syntax_flags | match_flags;
    originalString = s;
    flags = syntax_flags;
    RegExp e;
-    if (s.length() == 0) e = makeString("");
+    if (s.length() == 0) e = makeString(flags, "");
    else {
      e = parseUnionExp();
      if (pos < originalString.length()) throw new IllegalArgumentException(
@ -520,10 +549,10 @@ public class RegExp {
    to = e.to;
  }
-  RegExp(Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){    
+  RegExp(int flags, Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){    
    this.originalString = null;
    this.kind = kind;
-    this.flags = 0;
+    this.flags = flags;
    this.exp1 = exp1;
    this.exp2 = exp2;
    this.s = s;
@ -536,19 +565,19 @@ public class RegExp {
  }
  // Simplified construction of container nodes
-  static RegExp newContainerNode(Kind kind, RegExp exp1, RegExp exp2) {
+  static RegExp newContainerNode(int flags, Kind kind, RegExp exp1, RegExp exp2) {
-    return new RegExp(kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
+    return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
  }
  // Simplified construction of repeating nodes
-  static RegExp newRepeatingNode(Kind kind, RegExp exp,  int min, int max) {
+  static RegExp newRepeatingNode(int flags, Kind kind, RegExp exp,  int min, int max) {
-    return new RegExp(kind, exp, null, null, 0, min, max, 0, 0, 0);
+    return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, 0, 0);
  }  
  // Simplified construction of leaf nodes
-  static RegExp newLeafNode(Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
+  static RegExp newLeafNode(int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
-    return new RegExp(kind, null, null, s, c, min, max, digits, from, to);
+    return new RegExp(flags, kind, null, null, s, c, min, max, digits, from, to);
  }  
  /**
@ -703,7 +732,11 @@ public class RegExp {
        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
        break;
      case REGEXP_CHAR:
        if (check(ASCII_CASE_INSENSITIVE)) {
          a = toCaseInsensitiveChar(c, maxDeterminizedStates);
        } else {
          a = Automata.makeChar(c);          
        }
        break;
      case REGEXP_CHAR_RANGE:
        a = Automata.makeCharRange(from, to);
@ -715,7 +748,11 @@ public class RegExp {
        a = Automata.makeEmpty();
        break;
      case REGEXP_STRING:
        if (check(ASCII_CASE_INSENSITIVE)) {
          a = toCaseInsensitiveString(maxDeterminizedStates);
        } else {
          a = Automata.makeString(s);
        }
        break;
      case REGEXP_ANYSTRING:
        a = Automata.makeAnyString();
@ -743,6 +780,34 @@ public class RegExp {
    }
    return a;
  }
  /**
   * Builds an automaton that accepts the given code point in either case.
   * <p>
   * Only ASCII code points (0-127) are considered for case expansion; any
   * other code point is matched exactly as given.
   *
   * @param codepoint the code point to match
   * @param maxDeterminizedStates limit passed on to the minimization step
   * @return an automaton accepting {@code codepoint} and, when it has a
   *         different-case counterpart, that counterpart as well
   */
  private Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
    Automaton case1 = Automata.makeChar(codepoint);
    // For now we only work with ASCII characters; the ASCII range ends at 127.
    if (codepoint > 127) {
      return case1;
    }
    int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
    if (altCase == codepoint) {
      // No distinct other-case form (digits, punctuation, ...): match the code point as is.
      return case1;
    }
    Automaton result = Operations.union(case1, Automata.makeChar(altCase));
    return MinimizationOperations.minimize(result, maxDeterminizedStates);
  }
  /**
   * Builds an automaton for the string {@code s} of this node in which every
   * code point is expanded by {@code toCaseInsensitiveChar}.
   *
   * @param maxDeterminizedStates limit passed on to the per-code-point
   *        expansion and to the final minimization
   * @return the minimized concatenation of the per-code-point automata
   */
  private Automaton toCaseInsensitiveString(int maxDeterminizedStates) {
    List<Automaton> perCodePoint = new ArrayList<>();
    for (Iterator<Integer> it = s.codePoints().iterator(); it.hasNext();) {
      perCodePoint.add(toCaseInsensitiveChar(it.next(), maxDeterminizedStates));
    }
    return MinimizationOperations.minimize(
        Operations.concatenate(perCodePoint), maxDeterminizedStates);
  }
  private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
      Map<String,Automaton> automata, AutomatonProvider automaton_provider,
@ -1000,97 +1065,97 @@ public class RegExp {
    }
  }
-  static RegExp makeUnion(RegExp exp1, RegExp exp2) {
+  static RegExp makeUnion(int flags, RegExp exp1, RegExp exp2) {
-    return newContainerNode(Kind.REGEXP_UNION, exp1, exp2);
+    return newContainerNode(flags, Kind.REGEXP_UNION, exp1, exp2);
  }
-  static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
+  static RegExp makeConcatenation(int flags, RegExp exp1, RegExp exp2) {
    if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
        && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
-        exp1, exp2);
+            flags, exp1, exp2);
    RegExp rexp1, rexp2;
    if (exp1.kind == Kind.REGEXP_CONCATENATION
        && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
        && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
      rexp1 = exp1.exp1;
-      rexp2 = makeString(exp1.exp2, exp2);
+      rexp2 = makeString(flags, exp1.exp2, exp2);
    } else if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
        && exp2.kind == Kind.REGEXP_CONCATENATION
        && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
-      rexp1 = makeString(exp1, exp2.exp1);
+      rexp1 = makeString(flags, exp1, exp2.exp1);
      rexp2 = exp2.exp2;
    } else {
      rexp1 = exp1;
      rexp2 = exp2;
    }
-    return newContainerNode(Kind.REGEXP_CONCATENATION, rexp1, rexp2);
+    return newContainerNode(flags, Kind.REGEXP_CONCATENATION, rexp1, rexp2);
  }
-  static private RegExp makeString(RegExp exp1, RegExp exp2) {
+  static private RegExp makeString(int flags, RegExp exp1, RegExp exp2) {
    StringBuilder b = new StringBuilder();
    if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
    else b.appendCodePoint(exp1.c);
    if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
    else b.appendCodePoint(exp2.c);
-    return makeString(b.toString());
+    return makeString(flags, b.toString());
  }
-  static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
+  static RegExp makeIntersection(int flags, RegExp exp1, RegExp exp2) {
-    return newContainerNode(Kind.REGEXP_INTERSECTION, exp1, exp2);
+    return newContainerNode(flags, Kind.REGEXP_INTERSECTION, exp1, exp2);
  }
-  static RegExp makeOptional(RegExp exp) {
+  static RegExp makeOptional(int flags, RegExp exp) {
-    return newContainerNode(Kind.REGEXP_OPTIONAL, exp, null);
+    return newContainerNode(flags, Kind.REGEXP_OPTIONAL, exp, null);
  }
-  static RegExp makeRepeat(RegExp exp) {
+  static RegExp makeRepeat(int flags, RegExp exp) {
-    return newContainerNode(Kind.REGEXP_REPEAT, exp, null);
+    return newContainerNode(flags, Kind.REGEXP_REPEAT, exp, null);
  }
-  static RegExp makeRepeat(RegExp exp, int min) {
+  static RegExp makeRepeat(int flags, RegExp exp, int min) {
-    return newRepeatingNode(Kind.REGEXP_REPEAT_MIN, exp, min, 0);
+    return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MIN, exp, min, 0);
  }
-  static RegExp makeRepeat(RegExp exp, int min, int max) {
+  static RegExp makeRepeat(int flags, RegExp exp, int min, int max) {
-    return newRepeatingNode(Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
+    return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
  }
-  static RegExp makeComplement(RegExp exp) {
+  static RegExp makeComplement(int flags, RegExp exp) {
-    return newContainerNode(Kind.REGEXP_COMPLEMENT, exp, null);
+    return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null);
  }
-  static RegExp makeChar(int c) {
+  static RegExp makeChar(int flags, int c) {
-    return newLeafNode(Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
+    return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
  }
-  static RegExp makeCharRange(int from, int to) {
+  static RegExp makeCharRange(int flags, int from, int to) {
    if (from > to) 
      throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
-    return newLeafNode(Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
+    return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
  }
-  static RegExp makeAnyChar() {
+  static RegExp makeAnyChar(int flags) {
-    return newContainerNode(Kind.REGEXP_ANYCHAR, null, null);
+    return newContainerNode(flags, Kind.REGEXP_ANYCHAR, null, null);
  }
-  static RegExp makeEmpty() {
+  static RegExp makeEmpty(int flags) {
-    return newContainerNode(Kind.REGEXP_EMPTY, null, null);
+    return newContainerNode(flags, Kind.REGEXP_EMPTY, null, null);
  }
-  static RegExp makeString(String s) {
+  static RegExp makeString(int flags, String s) {
-    return newLeafNode(Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
+    return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
  }
-  static RegExp makeAnyString() {
+  static RegExp makeAnyString(int flags) {
-    return newContainerNode(Kind.REGEXP_ANYSTRING, null, null);
+    return newContainerNode(flags, Kind.REGEXP_ANYSTRING, null, null);
  }
-  static RegExp makeAutomaton(String s) {
+  static RegExp makeAutomaton(int flags, String s) {
-    return newLeafNode(Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
+    return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
  }
-  static RegExp makeInterval(int min, int max, int digits) {
+  static RegExp makeInterval(int flags, int min, int max, int digits) {
-  return newLeafNode(Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
+  return newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
  }
  private boolean peek(String s) {
@ -1123,13 +1188,13 @@ public class RegExp {
  final RegExp parseUnionExp() throws IllegalArgumentException {
    RegExp e = parseInterExp();
-    if (match('|')) e = makeUnion(e, parseUnionExp());
+    if (match('|')) e = makeUnion(flags, e, parseUnionExp());
    return e;
  }
  final RegExp parseInterExp() throws IllegalArgumentException {
    RegExp e = parseConcatExp();
-    if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
+    if (check(INTERSECTION) && match('&')) e = makeIntersection(flags, e,
        parseInterExp());
    return e;
  }
@ -1137,16 +1202,16 @@ public class RegExp {
  final RegExp parseConcatExp() throws IllegalArgumentException {
    RegExp e = parseRepeatExp();
    if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
-        e, parseConcatExp());
+        flags, e, parseConcatExp());
    return e;
  }
  final RegExp parseRepeatExp() throws IllegalArgumentException {
    RegExp e = parseComplExp();
    while (peek("?*+{")) {
-      if (match('?')) e = makeOptional(e);
+      if (match('?')) e = makeOptional(flags, e);
-      else if (match('*')) e = makeRepeat(e);
+      else if (match('*')) e = makeRepeat(flags, e);
-      else if (match('+')) e = makeRepeat(e, 1);
+      else if (match('+')) e = makeRepeat(flags, e, 1);
      else if (match('{')) {
        int start = pos;
        while (peek("0123456789"))
@ -1164,15 +1229,15 @@ public class RegExp {
        } else m = n;
        if (!match('}')) throw new IllegalArgumentException(
            "expected '}' at position " + pos);
-        if (m == -1) e = makeRepeat(e, n);
+        if (m == -1) e = makeRepeat(flags, e, n);
-        else e = makeRepeat(e, n, m);
+        else e = makeRepeat(flags, e, n, m);
      }
    }
    return e;
  }
  final RegExp parseComplExp() throws IllegalArgumentException {
-    if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
+    if (check(COMPLEMENT) && match('~')) return makeComplement(flags, parseComplExp());
    else return parseCharClassExp();
  }
@ -1181,7 +1246,7 @@ public class RegExp {
      boolean negate = false;
      if (match('^')) negate = true;
      RegExp e = parseCharClasses();
-      if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e));
+      if (negate) e = makeIntersection(flags, makeAnyChar(flags), makeComplement(flags, e));
      if (!match(']')) throw new IllegalArgumentException(
          "expected ']' at position " + pos);
      return e;
@ -1191,7 +1256,7 @@ public class RegExp {
  final RegExp parseCharClasses() throws IllegalArgumentException {
    RegExp e = parseCharClass();
    while (more() && !peek("]"))
-      e = makeUnion(e, parseCharClass());
+      e = makeUnion(flags, e, parseCharClass());
    return e;
  }
@ -1202,8 +1267,8 @@ public class RegExp {
    }
    int c = parseCharExp();
-    if (match('-')) return makeCharRange(c, parseCharExp());
+    if (match('-')) return makeCharRange(flags, c, parseCharExp());
-    else return makeChar(c);
+    else return makeChar(flags, c);
  }
  RegExp expandPredefined() {
@ -1232,11 +1297,11 @@ public class RegExp {
    //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
    if (match('\\')) {
      if (peek("dDwWsS")) {
-        return newLeafNode(Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
+        return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
      }
      if (peek("\\")) {
-        return makeChar(next());
+        return makeChar(flags, next());
      }
      // From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
@ -1252,18 +1317,18 @@ public class RegExp {
  final RegExp parseSimpleExp() throws IllegalArgumentException {
-    if (match('.')) return makeAnyChar();
+    if (match('.')) return makeAnyChar(flags);
-    else if (check(EMPTY) && match('#')) return makeEmpty();
+    else if (check(EMPTY) && match('#')) return makeEmpty(flags);
-    else if (check(ANYSTRING) && match('@')) return makeAnyString();
+    else if (check(ANYSTRING) && match('@')) return makeAnyString(flags);
    else if (match('"')) {
      int start = pos;
      while (more() && !peek("\""))
        next();
      if (!match('"')) throw new IllegalArgumentException(
          "expected '\"' at position " + pos);
-      return makeString(originalString.substring(start, pos - 1));
+      return makeString(flags, originalString.substring(start, pos - 1));
    } else if (match('(')) {
-      if (match(')')) return makeString("");
+      if (match(')')) return makeString(flags, "");
      RegExp e = parseUnionExp();
      if (!match(')')) throw new IllegalArgumentException(
          "expected ')' at position " + pos);
@ -1279,7 +1344,7 @@ public class RegExp {
      if (i == -1) {
        if (!check(AUTOMATON)) throw new IllegalArgumentException(
            "interval syntax error at position " + (pos - 1));
-        return makeAutomaton(s);
+        return makeAutomaton(flags, s);
      } else {
        if (!check(INTERVAL)) throw new IllegalArgumentException(
            "illegal identifier at position " + (pos - 1));
@ -1297,7 +1362,7 @@ public class RegExp {
            imin = imax;
            imax = t;
          }
-          return makeInterval(imin, imax, digits);
+          return makeInterval(flags, imin, imax, digits);
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "interval syntax error at position " + (pos - 1));
@ -1308,7 +1373,7 @@ public class RegExp {
      if (predefined != null) {
        return predefined;
      }
-      return makeChar(parseCharExp());
+      return makeChar(flags, parseCharExp());
    }
  }
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@ -73,6 +73,12 @@ public class TestRegexpQuery extends LuceneTestCase {
    return searcher.count(query);
  }
  /**
   * Counts the hits for {@code regex} when the query is built with
   * {@code RegExp.ALL} syntax flags and the
   * {@code RegExp.ASCII_CASE_INSENSITIVE} match flag.
   */
  private long caseInsensitiveRegexQueryNrHits(String regex) throws IOException {
    return searcher.count(
        new RegexpQuery(
            newTerm(regex),
            RegExp.ALL,
            RegExp.ASCII_CASE_INSENSITIVE,
            Operations.DEFAULT_MAX_DETERMINIZED_STATES));
  }
  /** Expects exactly one hit for a pattern mixing any-char, a character class and a trailing {@code .*}. */
  public void testRegex1() throws IOException {
    assertEquals(1, regexQueryNrHits("q.[aeiou]c.*"));
  }
@ -125,6 +131,11 @@ public class TestRegexpQuery extends LuceneTestCase {
    assertTrue(expected.getMessage().contains("invalid character class"));         
  }  
  /**
   * Checks that the same pattern yields a different hit count depending on
   * whether the ASCII case insensitive match flag is used.
   */
  public void testCaseInsensitive() throws IOException {
    // Via regexQueryNrHits (presumably built without match flags - helper body not fully visible here):
    // the capitalised pattern is expected to match nothing.
    assertEquals(0, regexQueryNrHits("Quick"));
    // With RegExp.ASCII_CASE_INSENSITIVE the same pattern is expected to match exactly one document.
    assertEquals(1, caseInsensitiveRegexQueryNrHits("Quick"));
  }  
  public void testRegexComplement() throws IOException {
    assertEquals(1, regexQueryNrHits("4934~[3]"));
    // not the empty lang, i.e. match all docs
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@ -88,10 +88,14 @@ public class TestRegExp extends LuceneTestCase {
    assertTrue(a.toString().length() > 0);
  }
  boolean caseSensitiveQuery = true;
  /**
   * Generates random doc values and random regular expressions and checks
   * for the same matching behaviour as Java's Pattern class.
   */
  public void testCoreJavaParity() {
    final int rounds = 1000;
    for (int round = 0; round < rounds; round++) {
      // Start every round case sensitive; the case-switching mutation sets this to false.
      caseSensitiveQuery = true;
      int minLength = 1 + random().nextInt(30);
      checkRandomExpression(randomDocValue(minLength));
    }
  }
@ -144,7 +148,7 @@ public class TestRegExp extends LuceneTestCase {
    // Modify the middle...
    String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
-    int mutation = random().nextInt(13);
+    int mutation = random().nextInt(15);
    switch (mutation) {
      case 0:
        // OR with random alpha of same length
@ -205,6 +209,25 @@ public class TestRegExp extends LuceneTestCase {
        // Replace any whitespace chars with the whitespace character class
        result.append(replacementPart.replaceAll("\\s", "\\\\s"));
        break;
      case 14:
        // Switch case of characters
        StringBuilder switchedCase = new StringBuilder();
        replacementPart.codePoints().forEach(
            p -> {
              int switchedP = p;
              if (Character.isLowerCase(p)) {
                switchedP = Character.toUpperCase(p);
              } else {
                switchedP = Character.toLowerCase(p);                
              }
              switchedCase.appendCodePoint(switchedP);
              if (p != switchedP) {
                caseSensitiveQuery = false;
              }
            }
        );        
        result.append(switchedCase.toString());
        break;
      default:
        break;
    }
@ -215,11 +238,14 @@ public class TestRegExp extends LuceneTestCase {
    String regexPattern = result.toString();
    // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
-    Pattern pattern = Pattern.compile(regexPattern);
+    Pattern pattern = caseSensitiveQuery ? Pattern.compile(regexPattern): 
                                           Pattern.compile(regexPattern, Pattern.CASE_INSENSITIVE); 
                                             ;
    Matcher matcher = pattern.matcher(docValue);
    assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
-    RegExp regex = new RegExp(regexPattern);
+    int matchFlags = caseSensitiveQuery ? 0 : RegExp.ASCII_CASE_INSENSITIVE;
    RegExp regex =  new RegExp(regexPattern, RegExp.ALL, matchFlags);
    Automaton automaton = regex.toAutomaton();
    ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
    BytesRef br = new BytesRef(docValue);
@ -228,6 +254,16 @@ public class TestRegExp extends LuceneTestCase {
            + docValue.length(),
        bytesMatcher.run(br.bytes, br.offset, br.length)
    );
    if (caseSensitiveQuery == false) {
      RegExp caseSensitiveRegex = new RegExp(regexPattern);
      Automaton csAutomaton = caseSensitiveRegex.toAutomaton();
      ByteRunAutomaton csBytesMatcher = new ByteRunAutomaton(csAutomaton);
      assertFalse(
          "[" + regexPattern + "] with case sensitive setting should not match [" + docValue + "]", 
          csBytesMatcher.run(br.bytes, br.offset, br.length)
      );
    }
    return regexPattern;
  }