RegEx querying - add support for Java’s predefined character classes like \d for digits (#1489)

Supports the same \w \W \s \S \d and \D character classes as Java's Pattern matcher.
2020-05-14 10:04:25 +01:00 · 2020-05-14 10:04:25 +01:00 · 1efce5444d
parent 4b9808a03d
commit 1efce5444d
3 changed files with 267 additions and 3 deletions
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@ -290,6 +290,55 @@ import java.util.Set;
 * <td>(a single non-reserved character)</td>
 * <td></td>
 * </tr>
+ * 
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><code><b>\d</b></code></td>
+ * <td>(a digit [0-9])</td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><code><b>\D</b></code></td>
+ * <td>(a non-digit [^0-9])</td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><code><b>\s</b></code></td>
+ * <td>(whitespace [ \t\n\r])</td>
+ * <td></td>
+ * </tr>
+ * 
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><code><b>\S</b></code></td>
+ * <td>(a non-whitespace character [^\s])</td>
+ * <td></td>
+ * </tr>
+ *  
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><code><b>\w</b></code></td>
+ * <td>(a word character [a-zA-Z_0-9])</td>
+ * <td></td>
+ * </tr>
+ *  
+ * <tr>
+ * <td></td>
+ * <td>|</td>
+ * <td><code><b>\W</b></code></td>
+ * <td>(a non-word character [^\w])</td>
+ * <td></td>
+ * </tr>
+ *  
 * <tr>
 * <td></td>
 * <td>|</td>
@ -317,7 +366,8 @@ import java.util.Set;
 public class RegExp {
  
  enum Kind {
-    REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
+    REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL,
+    REGEXP_PRE_CLASS
  }
  
  /**
@ -506,6 +556,10 @@ public class RegExp {
    List<Automaton> list;
    Automaton a = null;
    switch (kind) {
+      case REGEXP_PRE_CLASS:
+        RegExp expanded = expandPredefined();
+        a = expanded.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
+        break;
      case REGEXP_UNION:
        list = new ArrayList<>();
        findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
@ -716,6 +770,9 @@ public class RegExp {
          b.append('0');
        b.append(s2).append(">");
        break;
+      case REGEXP_PRE_CLASS:
+        b.append("\\").appendCodePoint(from);
+        break;
    }
  }

@ -774,6 +831,13 @@ public class RegExp {
        b.appendCodePoint(c);
        b.append('\n');
        break;
+      case REGEXP_PRE_CLASS:
+        b.append(indent);
+        b.append(kind);
+        b.append(" class=\\");
+        b.appendCodePoint(from);
+        b.append('\n');
+        break;        
      case REGEXP_CHAR_RANGE:
        b.append(indent);
        b.append(kind);
@ -1101,10 +1165,51 @@ public class RegExp {
  }
  
  final RegExp parseCharClass() throws IllegalArgumentException {
+    RegExp predefinedExp = matchPredefinedCharacterClass(); // try \d, \D, \s, \S, \w, \W before a literal char
+    if (predefinedExp != null) {
+      return predefinedExp; // returned before the '-' check below, so a predefined class never starts a range
+    }
+        
    int c = parseCharExp();
    if (match('-')) return makeCharRange(c, parseCharExp());
    else return makeChar(c);
  }
+
+  /**
+   * Expands this predefined-character-class node into an equivalent explicit
+   * character-class expression, selected by the class letter held in {@code from}.
+   *
+   * <p>The negated classes {@code S} and {@code W} are written in terms of {@code \s}
+   * and {@code \w}, so the returned expression itself contains a predefined class.
+   *
+   * @return a newly parsed expression equivalent to the predefined class
+   * @throws IllegalArgumentException if {@code from} is not one of d, D, s, S, w or W
+   */
+  RegExp expandPredefined() {
+    //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+    switch (from) {
+      case 'd':
+        return new RegExp("[0-9]"); // digit
+      case 'D':
+        return new RegExp("[^0-9]"); // non-digit
+      case 's':
+        return new RegExp("[ \t\n\r]"); // whitespace
+      case 'S':
+        return new RegExp("[^\\s]"); // non-whitespace
+      case 'w':
+        return new RegExp("[a-zA-Z_0-9]"); // word
+      case 'W':
+        return new RegExp("[^\\w]"); // non-word
+      default:
+        // from is a code point: render it as a character (as the string form of this
+        // node does) instead of letting string concatenation print its decimal value.
+        throw new IllegalArgumentException(
+            new StringBuilder("invalid character class \\").appendCodePoint(from).toString());
+    }
+  }
+
+  
+  /**
+   * Tries to consume a predefined character class (a backslash followed by one of
+   * {@code d D w W s S}) at the current parse position.
+   *
+   * <p>The backslash is only consumed when a class letter follows it. Otherwise the parse
+   * position is restored, so the caller's fallback parsing starts at the backslash rather
+   * than one character past it. Without the rewind an escaped backslash would have its
+   * first backslash swallowed here and the second one treated as the escape character.
+   *
+   * @return a {@code REGEXP_PRE_CLASS} node whose {@code from} holds the class letter, or
+   *         {@code null} if no predefined class starts at the current position
+   */
+  final RegExp matchPredefinedCharacterClass() {
+    //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+    // NOTE(review): assumes pos is the parser cursor that match() advances - confirm.
+    final int start = pos;
+    if (match('\\')) {
+      if (peek("dDwWsS")) {
+        RegExp re = new RegExp();
+        re.kind = Kind.REGEXP_PRE_CLASS;
+        re.from = next();
+        return re;
+      }
+      // Not a predefined class: un-consume the backslash for the caller's fallback.
+      pos = start;
+    }
+    return null;
+  }
+  
  
  final RegExp parseSimpleExp() throws IllegalArgumentException {
    if (match('.')) return makeAnyChar();
@ -1158,7 +1263,13 @@ public class RegExp {
              "interval syntax error at position " + (pos - 1));
        }
      }
-    } else return makeChar(parseCharExp());
+    } else {
+      RegExp predefined = matchPredefinedCharacterClass();
+      if (predefined != null) {
+        return predefined;
+      }
+      return makeChar(parseCharExp());
+    }
  }
  
  final int parseCharExp() throws IllegalArgumentException {
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    Document doc = new Document();
-    doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344", Field.Store.NO));
+    doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO));
    writer.addDocument(doc);
    reader = writer.getReader();
    writer.close();
@ -90,6 +90,32 @@ public class TestRegexpQuery extends LuceneTestCase {
    assertEquals(0, regexQueryNrHits("<493433-600000>"));
  }
  
+  public void testCharacterClasses() throws IOException {
+    assertEquals(0, regexQueryNrHits("\\d"));
+    assertEquals(1, regexQueryNrHits("\\d*"));
+    assertEquals(1, regexQueryNrHits("\\d{6}"));
+    assertEquals(1, regexQueryNrHits("[a\\d]{6}"));
+    assertEquals(1, regexQueryNrHits("\\d{2,7}"));
+    assertEquals(0, regexQueryNrHits("\\d{4}"));
+    assertEquals(0, regexQueryNrHits("\\dog"));
+    assertEquals(1, regexQueryNrHits("493\\d32"));
+    
+    assertEquals(1, regexQueryNrHits("\\wox"));
+    assertEquals(1, regexQueryNrHits("493\\w32"));
+    assertEquals(1, regexQueryNrHits("\\?\\?\\?"));
+    assertEquals(1, regexQueryNrHits("\\?\\W\\?"));
+    assertEquals(1, regexQueryNrHits("\\?\\S\\?"));
+    
+    assertEquals(1, regexQueryNrHits("\\[foo\\]"));
+    assertEquals(1, regexQueryNrHits("\\[\\w{3}\\]"));
+    
+    assertEquals(0, regexQueryNrHits("\\s.*")); // no matches because all whitespace stripped
+    assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
+    assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
+    assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
+    
+  }  
+  
  public void testRegexComplement() throws IOException {
    assertEquals(1, regexQueryNrHits("4934~[3]"));
    // not the empty lang, i.e. match all docs
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@ -17,8 +17,12 @@
 package org.apache.lucene.util.automaton;


+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;

+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 public class TestRegExp extends LuceneTestCase {

  /**
@ -83,4 +87,127 @@ public class TestRegExp extends LuceneTestCase {
    a = new RegExp("#?").toAutomaton(1000);
    assertTrue(a.toString().length() > 0);
  }
+  
+  public void testCoreJavaParity() {
+    // Generate random doc values and random regular expressions
+    // and check for same matching behaviour as Java's Pattern class.
+    for (int i = 0; i < 1000; i++) {
+      checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
+    }        
+  }
+
+  /**
+   * Builds a random string from a small palette of letters, digits, space and tab.
+   *
+   * @param minLength number of characters to generate; despite the name, the result is
+   *     exactly this long
+   * @return the generated string
+   */
+  static String randomDocValue(int minLength) {
+    String charPalette = "AAAaaaBbbCccc123456 \t";
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < minLength; i++) {
+      // Pass the full palette length: with "length() - 1" the last entry (the tab)
+      // could never be selected.
+      sb.append(charPalette.charAt(randomInt(charPalette.length())));
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Returns 0 when {@code bound} is 0, otherwise the result of
+   * {@code random().nextInt(bound)}.
+   */
+  private static int randomInt(int bound) {
+    if (bound == 0) {
+      return 0;
+    }
+    return random().nextInt(bound);
+  }
+
+  /**
+   * Generates a random regular expression that should match {@code docValue} and asserts
+   * that both Java's {@link Pattern} and this package's {@code RegExp} accept the value.
+   *
+   * <p>A randomly chosen slice of the value is rewritten using one of 14 mutations; the
+   * text before and after the slice is kept verbatim. One mutation calls this method
+   * recursively to build nested expressions.
+   *
+   * @param docValue the raw value the generated expression must match
+   * @return the generated regular expression
+   */
+  protected String checkRandomExpression(String docValue) {
+    // Generate and test a random regular expression which should match the given docValue
+    StringBuilder result = new StringBuilder();
+    // Pick a part of the string to change
+    int substitutionPoint = randomInt(docValue.length() - 1);
+    int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));
+
+    // Add any head to the result, unchanged
+    if (substitutionPoint > 0) {
+      result.append(docValue.substring(0, substitutionPoint));
+    }
+
+    // Modify the middle...
+    String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
+    // The bound is exclusive, so it must be one more than the highest case label below;
+    // with a bound of 13 the whitespace-class mutation (case 13) could never run.
+    int mutation = random().nextInt(14);
+    switch (mutation) {
+      case 0:
+        // OR with random alpha of same length
+        result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
+        break;
+      case 1:
+        // OR with non-existent value
+        result.append("(" + replacementPart + "|doesnotexist)");
+        break;
+      case 2:
+        // OR with another randomised regex (used to create nested levels of expression).
+        result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
+        break;
+      case 3:
+        // Star-replace all ab sequences.
+        result.append(replacementPart.replaceAll("ab", ".*"));
+        break;
+      case 4:
+        // .-replace all b chars
+        result.append(replacementPart.replaceAll("b", "."));
+        break;
+      case 5:
+        // length-limited stars {1,2}
+        result.append(".{1," + replacementPart.length() + "}");
+        break;
+      case 6:
+        // replace all chars with .
+        result.append(replacementPart.replaceAll(".", "."));
+        break;
+      case 7:
+        // OR with uppercase chars eg [aA] (many of these sorts of expression in the wild)
+        char[] chars = replacementPart.toCharArray();
+        for (char c : chars) {
+          result.append("[" + c + Character.toUpperCase(c) + "]");
+        }
+        break;
+      case 8:
+        // NOT a character - replace all b's with "not a"
+        result.append(replacementPart.replaceAll("b", "[^a]"));
+        break;
+      case 9:
+        // Make whole part repeatable 1 or more times
+        result.append("(" + replacementPart + ")+");
+        break;
+      case 10:
+        // Make whole part optional (0 or 1 times)
+        result.append("(" + replacementPart + ")?");
+        break;
+      case 11:
+        // Make any digits replaced by character class
+        result.append(replacementPart.replaceAll("\\d", "\\\\d"));
+        break;
+      case 12:
+        // Make any whitespace chars replaced by not word class
+        result.append(replacementPart.replaceAll("\\s", "\\\\W"));
+        break;
+      case 13:
+        // Make any whitespace chars replaced by whitespace class
+        result.append(replacementPart.replaceAll("\\s", "\\\\s"));
+        break;
+      default:
+        // Fail loudly if the bound above and the case labels ever drift apart again.
+        throw new AssertionError("unhandled mutation " + mutation);
+    }
+    // add any remaining tail, unchanged
+    if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
+      result.append(docValue.substring(substitutionPoint + substitutionLength));
+    }
+
+    String regexPattern = result.toString();
+    // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
+    Pattern pattern = Pattern.compile(regexPattern);
+    Matcher matcher = pattern.matcher(docValue);
+    assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
+
+    // Now check the same expression and input against the automaton built by RegExp
+    RegExp regex = new RegExp(regexPattern);
+    Automaton automaton = regex.toAutomaton();
+    ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
+    BytesRef br = new BytesRef(docValue);
+    assertTrue(
+        "[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
+            + docValue.length(),
+        bytesMatcher.run(br.bytes, br.offset, br.length)
+    );
+    return regexPattern;
+  }
+  
 }