diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index e8b37e59a3f..d0445026e9c 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -290,6 +290,55 @@ import java.util.Set;
*
(a single non-reserved character) |
* |
*
+ *
+ *
+ * |
+ * | |
+ * \d |
+ * (a digit [0-9]) |
+ * |
+ *
+ *
+ *
+ * |
+ * | |
+ * \D |
+ * (a non-digit [^0-9]) |
+ * |
+ *
+ *
+ *
+ * |
+ * | |
+ * \s |
+ * (whitespace [ \t\n\r]) |
+ * |
+ *
+ *
+ *
+ * |
+ * | |
+ * \S |
+ * (a non-whitespace character [^\s]) |
+ * |
+ *
+ *
+ *
+ * |
+ * | |
+ * \w |
+ * (a word character [a-zA-Z_0-9]) |
+ * |
+ *
+ *
+ *
+ * |
+ * | |
+ * \W |
+ * (a non-word character [^\w]) |
+ * |
+ *
+ *
*
* |
* | |
@@ -317,7 +366,8 @@ import java.util.Set;
public class RegExp {
enum Kind {
- REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
+ REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL,
+ REGEXP_PRE_CLASS // predefined character class: one of \d, \D, \s, \S, \w, \W
}
/**
@@ -506,6 +556,10 @@ public class RegExp {
List list;
Automaton a = null;
switch (kind) {
+ case REGEXP_PRE_CLASS:
+ RegExp expanded = expandPredefined();
+ a = expanded.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
+ break;
case REGEXP_UNION:
list = new ArrayList<>();
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
@@ -716,6 +770,9 @@ public class RegExp {
b.append('0');
b.append(s2).append(">");
break;
+ case REGEXP_PRE_CLASS:
+ b.append("\\").appendCodePoint(from);
+ break;
}
}
@@ -774,6 +831,13 @@ public class RegExp {
b.appendCodePoint(c);
b.append('\n');
break;
+ case REGEXP_PRE_CLASS:
+ b.append(indent);
+ b.append(kind);
+ b.append(" class=\\");
+ b.appendCodePoint(from);
+ b.append('\n');
+ break;
case REGEXP_CHAR_RANGE:
b.append(indent);
b.append(kind);
@@ -1101,10 +1165,51 @@ public class RegExp {
}
final RegExp parseCharClass() throws IllegalArgumentException {
+ RegExp predefinedExp = matchPredefinedCharacterClass(); // first try a predefined class such as \d or \w
+ if (predefinedExp != null) {
+ return predefinedExp; // returned before the '-' check below, so a predefined class never starts a range
+ }
+
int c = parseCharExp();
if (match('-')) return makeCharRange(c, parseCharExp());
else return makeChar(c);
}
+
+ /**
+  * Expands this {@code REGEXP_PRE_CLASS} node into the equivalent explicit bracket
+  * expression, selected by the class letter stored in {@code from}.
+  *
+  * @return a newly parsed {@code RegExp} for the bracket expression
+  * @throws IllegalArgumentException if {@code from} is not one of d, D, s, S, w, W
+  */
+ RegExp expandPredefined() {
+   //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+   final String expansion;
+   switch (from) {
+     case 'd':
+       expansion = "[0-9]"; // digit
+       break;
+     case 'D':
+       expansion = "[^0-9]"; // non-digit
+       break;
+     case 's':
+       expansion = "[ \t\n\r]"; // whitespace
+       break;
+     case 'S':
+       expansion = "[^\\s]"; // non-whitespace, defined by negating the \s class
+       break;
+     case 'w':
+       expansion = "[a-zA-Z_0-9]"; // word
+       break;
+     case 'W':
+       expansion = "[^\\w]"; // non-word, defined by negating the \w class
+       break;
+     default:
+       // from holds a code point; render it as a character rather than its numeric value
+       throw new IllegalArgumentException(
+           "invalid character class \\" + new StringBuilder().appendCodePoint(from));
+   }
+   return new RegExp(expansion);
+ }
+
+
+ /**
+  * Tries to parse a predefined character class ({@code \d}, {@code \D}, {@code \w},
+  * {@code \W}, {@code \s} or {@code \S}) at the current parse position.
+  *
+  * <p>When the backslash is not followed by one of those letters, the parse position is
+  * restored so that the caller still sees the complete escape sequence. Otherwise an
+  * escaped backslash would lose its first backslash here, and the caller would then
+  * treat the second backslash as the escape for whatever character follows.
+  *
+  * @return a {@code REGEXP_PRE_CLASS} node whose {@code from} is the class letter, or
+  *     {@code null} (with nothing consumed) if no predefined class starts here
+  */
+ final RegExp matchPredefinedCharacterClass() {
+   //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+   final int start = pos;
+   if (match('\\')) {
+     if (peek("dDwWsS")) {
+       RegExp re = new RegExp();
+       re.kind = Kind.REGEXP_PRE_CLASS;
+       re.from = next();
+       return re;
+     }
+     // Not a predefined class: un-consume the backslash and leave the escape to the caller.
+     pos = start;
+   }
+   return null;
+ }
+
final RegExp parseSimpleExp() throws IllegalArgumentException {
if (match('.')) return makeAnyChar();
@@ -1158,7 +1263,13 @@ public class RegExp {
"interval syntax error at position " + (pos - 1));
}
}
- } else return makeChar(parseCharExp());
+ } else {
+ RegExp predefined = matchPredefinedCharacterClass();
+ if (predefined != null) {
+ return predefined;
+ }
+ return makeChar(parseCharExp());
+ }
}
final int parseCharExp() throws IllegalArgumentException {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
index 6a8e183e0d4..d760a4a1680 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
Document doc = new Document();
- doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344", Field.Store.NO));
+ doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO));
writer.addDocument(doc);
reader = writer.getReader();
writer.close();
@@ -90,6 +90,32 @@ public class TestRegexpQuery extends LuceneTestCase {
assertEquals(0, regexQueryNrHits("<493433-600000>"));
}
+ public void testCharacterClasses() throws IOException {
+ assertEquals(0, regexQueryNrHits("\\d")); // no single-digit term in the indexed text
+ assertEquals(1, regexQueryNrHits("\\d*")); // all-digit terms exist, e.g. 493432
+ assertEquals(1, regexQueryNrHits("\\d{6}")); // matches 493432
+ assertEquals(1, regexQueryNrHits("[a\\d]{6}")); // predefined class used inside a bracket expression
+ assertEquals(1, regexQueryNrHits("\\d{2,7}")); // matches 493432 and 49344
+ assertEquals(0, regexQueryNrHits("\\d{4}")); // no term made of exactly four digits
+ assertEquals(0, regexQueryNrHits("\\dog")); // parsed as \d followed by "og", so "dog" does not match
+ assertEquals(1, regexQueryNrHits("493\\d32")); // matches 493432
+
+ assertEquals(1, regexQueryNrHits("\\wox")); // matches fox
+ assertEquals(1, regexQueryNrHits("493\\w32")); // digits are word characters too
+ assertEquals(1, regexQueryNrHits("\\?\\?\\?")); // ordinary escapes still work: matches ???
+ assertEquals(1, regexQueryNrHits("\\?\\W\\?")); // ? is a non-word character
+ assertEquals(1, regexQueryNrHits("\\?\\S\\?")); // ? is a non-whitespace character
+
+ assertEquals(1, regexQueryNrHits("\\[foo\\]")); // escaped brackets match the literal term [foo]
+ assertEquals(1, regexQueryNrHits("\\[\\w{3}\\]")); // predefined class between escaped brackets
+
+ assertEquals(0, regexQueryNrHits("\\s.*")); // no matches because all whitespace stripped
+ assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
+ assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
+ assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
+
+ }
+
public void testRegexComplement() throws IOException {
assertEquals(1, regexQueryNrHits("4934~[3]"));
// not the empty lang, i.e. match all docs
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
index 7d24939c347..8fd6935e486 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@@ -17,8 +17,12 @@
package org.apache.lucene.util.automaton;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
public class TestRegExp extends LuceneTestCase {
/**
@@ -83,4 +87,127 @@ public class TestRegExp extends LuceneTestCase {
a = new RegExp("#?").toAutomaton(1000);
assertTrue(a.toString().length() > 0);
}
+
+ /**
+  * Runs 1000 rounds of the randomized parity check: each round builds a random doc value
+  * of 1 to 30 characters and hands it to {@code checkRandomExpression}, which compares
+  * matching behaviour with Java's Pattern class.
+  */
+ public void testCoreJavaParity() {
+   int roundsLeft = 1000;
+   while (roundsLeft-- > 0) {
+     int length = 1 + random().nextInt(30);
+     String docValue = randomDocValue(length);
+     checkRandomExpression(docValue);
+   }
+ }
+
+ /**
+  * Builds a random string of exactly {@code minLength} characters drawn from a small
+  * palette of letters, digits, space and tab.
+  *
+  * @param minLength number of characters to generate
+  * @return the generated string
+  */
+ static String randomDocValue(int minLength) {
+   String charPalette = "AAAaaaBbbCccc123456 \t";
+   StringBuilder sb = new StringBuilder();
+   for (int i = 0; i < minLength; i++) {
+     // The bound is exclusive, so pass the full length; with length() - 1 the
+     // trailing tab in the palette could never be selected.
+     sb.append(charPalette.charAt(randomInt(charPalette.length())));
+   }
+   return sb.toString();
+ }
+
+ /**
+  * Returns a random int in {@code [0, bound)}, or 0 when {@code bound} is 0 (a bound
+  * that {@code nextInt} itself would reject).
+  */
+ private static int randomInt(int bound) {
+   if (bound == 0) {
+     return 0;
+   }
+   return random().nextInt(bound);
+ }
+
+ /**
+  * Derives a random regular expression from {@code docValue} by rewriting a randomly chosen
+  * substring with one of several mutations, then asserts that both Java's {@link Pattern}
+  * and {@link RegExp} accept {@code docValue} in full.
+  *
+  * @param docValue the raw value the generated expression must match
+  * @return the generated regular expression (reused to build nested expressions)
+  */
+ protected String checkRandomExpression(String docValue) {
+   // Generate and test a random regular expression which should match the given docValue
+   StringBuilder result = new StringBuilder();
+   // Pick a part of the string to change
+   int substitutionPoint = randomInt(docValue.length() - 1);
+   int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));
+
+   // Add any head to the result, unchanged
+   if (substitutionPoint > 0) {
+     result.append(docValue.substring(0, substitutionPoint));
+   }
+
+   // Modify the middle...
+   String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
+   // The bound is exclusive: 14 makes every case from 0 to 13 reachable
+   // (with 13 the \s mutation in case 13 was dead code).
+   int mutation = random().nextInt(14);
+   switch (mutation) {
+     case 0:
+       // OR with random alpha of same length
+       result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
+       break;
+     case 1:
+       // OR with non-existent value
+       result.append("(" + replacementPart + "|doesnotexist)");
+       break;
+     case 2:
+       // OR with another randomised regex (used to create nested levels of expression).
+       result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
+       break;
+     case 3:
+       // Star-replace all ab sequences.
+       result.append(replacementPart.replaceAll("ab", ".*"));
+       break;
+     case 4:
+       // .-replace all b chars
+       result.append(replacementPart.replaceAll("b", "."));
+       break;
+     case 5:
+       // length-limited stars {1,2}
+       result.append(".{1," + replacementPart.length() + "}");
+       break;
+     case 6:
+       // replace all chars with .
+       result.append(replacementPart.replaceAll(".", "."));
+       break;
+     case 7:
+       // OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..
+       char[] chars = replacementPart.toCharArray();
+       for (char c : chars) {
+         result.append("[" + c + Character.toUpperCase(c) + "]");
+       }
+       break;
+     case 8:
+       // NOT a character - replace all b's with "not a"
+       result.append(replacementPart.replaceAll("b", "[^a]"));
+       break;
+     case 9:
+       // Make whole part repeatable 1 or more times
+       result.append("(" + replacementPart + ")+");
+       break;
+     case 10:
+       // Make whole part optional (0 or 1 times)
+       result.append("(" + replacementPart + ")?");
+       break;
+     case 11:
+       // Make any digits replaced by character class
+       result.append(replacementPart.replaceAll("\\d", "\\\\d"));
+       break;
+     case 12:
+       // Make any whitespace chars replaced by not word class
+       result.append(replacementPart.replaceAll("\\s", "\\\\W"));
+       break;
+     case 13:
+       // Make any whitespace chars replaced by whitespace class
+       result.append(replacementPart.replaceAll("\\s", "\\\\s"));
+       break;
+     default:
+       break;
+   }
+   // add any remaining tail, unchanged
+   if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
+     result.append(docValue.substring(substitutionPoint + substitutionLength));
+   }
+
+   String regexPattern = result.toString();
+   // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
+   Pattern pattern = Pattern.compile(regexPattern);
+   Matcher matcher = pattern.matcher(docValue);
+   assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
+
+   // Then require the same verdict from the automaton built from the same pattern
+   RegExp regex = new RegExp(regexPattern);
+   Automaton automaton = regex.toAutomaton();
+   ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
+   BytesRef br = new BytesRef(docValue);
+   assertTrue(
+       "[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
+           + docValue.length(),
+       bytesMatcher.run(br.bytes, br.offset, br.length)
+   );
+   return regexPattern;
+ }
+
}