diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index e8b37e59a3f..d0445026e9c 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -290,6 +290,55 @@ import java.util.Set;
  * (a single non-reserved character)
  *
  *
+ *
+ *
+ * |
+ * \d
+ * (a digit [0-9])
+ *
+ *
+ *
+ *
+ * |
+ * \D
+ * (a non-digit [^0-9])
+ *
+ *
+ *
+ *
+ * |
+ * \s
+ * (whitespace [ \t\n\r])
+ *
+ *
+ *
+ *
+ * |
+ * \S
+ * (non whitespace [^\s])
+ *
+ *
+ *
+ *
+ * |
+ * \w
+ * (a word character [a-zA-Z_0-9])
+ *
+ *
+ *
+ *
+ * |
+ * \W
+ * (a non word character [^\w])
+ *
+ *
+ *
  *
  *
  * |
@@ -317,7 +366,8 @@ import java.util.Set;
 public class RegExp {
 
   enum Kind {
-    REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
+    REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL,
+    REGEXP_PRE_CLASS // predefined class (\d \D \s \S \w \W); the class letter is stored in 'from'
   }
 
   /**
@@ -506,6 +556,10 @@ public class RegExp {
     List list;
     Automaton a = null;
     switch (kind) {
+      case REGEXP_PRE_CLASS: // compile the equivalent expression returned by expandPredefined()
+        RegExp expanded = expandPredefined();
+        a = expanded.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
+        break;
       case REGEXP_UNION:
         list = new ArrayList<>();
         findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
@@ -716,6 +770,9 @@ public class RegExp {
         b.append('0');
         b.append(s2).append(">");
         break;
+      case REGEXP_PRE_CLASS: // re-emit as a backslash followed by the class letter
+        b.append("\\").appendCodePoint(from);
+        break;
     }
   }
 
@@ -774,6 
+831,13 @@ public class RegExp {
         b.appendCodePoint(c);
         b.append('\n');
         break;
+      case REGEXP_PRE_CLASS:
+        b.append(indent);
+        b.append(kind);
+        b.append(" class=\\");
+        b.appendCodePoint(from);
+        b.append('\n');
+        break;
       case REGEXP_CHAR_RANGE:
         b.append(indent);
         b.append(kind);
@@ -1101,10 +1165,71 @@ public class RegExp {
   }
 
   final RegExp parseCharClass() throws IllegalArgumentException {
+    RegExp predefinedExp = matchPredefinedCharacterClass();
+    if (predefinedExp != null) {
+      return predefinedExp;
+    }
+
     int c = parseCharExp();
     if (match('-')) return makeCharRange(c, parseCharExp());
     else return makeChar(c);
   }
+
+  /**
+   * Expands this REGEXP_PRE_CLASS node into the bracket expression it abbreviates,
+   * chosen by the class letter held in {@code from}.
+   *
+   * @return a new RegExp built from the equivalent bracket expression
+   * @throws IllegalArgumentException if {@code from} is not one of d, D, s, S, w, W
+   */
+  RegExp expandPredefined() {
+    //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+    switch (from) {
+      case 'd':
+        return new RegExp("[0-9]"); // digit
+      case 'D':
+        return new RegExp("[^0-9]"); // non-digit
+      case 's':
+        return new RegExp("[ \t\n\r]"); // whitespace (narrower than java.util.regex, which also has \x0B and \f)
+      case 'S':
+        return new RegExp("[^\\s]"); // non-whitespace
+      case 'w':
+        return new RegExp("[a-zA-Z_0-9]"); // word
+      case 'W':
+        return new RegExp("[^\\w]"); // non-word
+      default:
+        throw new IllegalArgumentException(
+            "invalid character class " + from);
+    }
+  }
+
+
+  /**
+   * Tries to read a predefined character class (backslash followed by one of
+   * d, D, w, W, s, S) at the current parse position.
+   *
+   * @return a REGEXP_PRE_CLASS node whose {@code from} is the class letter, or
+   *         {@code null} when no predefined class starts here; in that case the
+   *         parse position is left where it was
+   */
+  final RegExp matchPredefinedCharacterClass() {
+    //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
+    int start = pos;
+    if (match('\\')) {
+      if (peek("dDwWsS")) {
+        RegExp re =new RegExp();
+        re.kind = Kind.REGEXP_PRE_CLASS;
+        re.from = next();
+        return re;
+      }
+      // Some other escape such as "\\" or "\?": rewind so the backslash is still
+      // unread for the caller's parseCharExp() fallback. Leaving it consumed would
+      // let that fallback mis-read an escaped backslash (assumes it handles escapes).
+      pos = start;
+    }
+    return null;
+  }
+
 
   final RegExp parseSimpleExp() throws IllegalArgumentException {
     if (match('.')) return makeAnyChar();
@@ -1158,7 +1283,13 @@ public class RegExp {
               "interval syntax error at position " + (pos - 1));
         }
       }
-    } else return makeChar(parseCharExp());
+    } else {
+      RegExp predefined = matchPredefinedCharacterClass(); // try \d, \w, ... before treating the input as a plain char
+      if (predefined != null) {
+        return predefined;
+      }
+      return makeChar(parseCharExp());
+    }
   }
 
   final int parseCharExp() throws IllegalArgumentException {
diff --git 
a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
index 6a8e183e0d4..d760a4a1680 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpQuery.java
@@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
     directory = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
     Document doc = new Document();
-    doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344", Field.Store.NO));
+    doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO));
     writer.addDocument(doc);
     reader = writer.getReader();
     writer.close();
@@ -90,6 +90,32 @@ public class TestRegexpQuery extends LuceneTestCase {
     assertEquals(0, regexQueryNrHits("<493433-600000>"));
   }
 
+  public void testCharacterClasses() throws IOException {
+    assertEquals(0, regexQueryNrHits("\\d")); // exactly one digit; presumably no indexed term is a lone digit
+    assertEquals(1, regexQueryNrHits("\\d*")); // any run of digits
+    assertEquals(1, regexQueryNrHits("\\d{6}")); // six digits, e.g. 493432
+    assertEquals(1, regexQueryNrHits("[a\\d]{6}")); // predefined class used inside a bracket expression
+    assertEquals(1, regexQueryNrHits("\\d{2,7}"));
+    assertEquals(0, regexQueryNrHits("\\d{4}")); // presumably 0 because no term consists of exactly four digits
+    assertEquals(0, regexQueryNrHits("\\dog")); // a digit followed by "og"; "dog" starts with a letter
+    assertEquals(1, regexQueryNrHits("493\\d32"));
+
+    assertEquals(1, regexQueryNrHits("\\wox")); // e.g. fox
+    assertEquals(1, regexQueryNrHits("493\\w32")); // \w also covers digits
+    assertEquals(1, regexQueryNrHits("\\?\\?\\?")); // ordinary escapes must keep working next to the new classes
+    assertEquals(1, regexQueryNrHits("\\?\\W\\?")); // '?' is a non-word character
+    assertEquals(1, regexQueryNrHits("\\?\\S\\?")); // '?' is a non-whitespace character
+
+    assertEquals(1, regexQueryNrHits("\\[foo\\]")); // escaped brackets match the literal [foo]
+    assertEquals(1, regexQueryNrHits("\\[\\w{3}\\]"));
+
+    assertEquals(0, regexQueryNrHits("\\s.*")); // no matches because all whitespace stripped
+    assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
+    assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
+    assertEquals(1, 
regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
+
+  }
+
   public void testRegexComplement() throws IOException {
     assertEquals(1, regexQueryNrHits("4934~[3]"));
     // not the empty lang, i.e. match all docs
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
index 7d24939c347..8fd6935e486 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@@ -17,8 +17,12 @@
 package org.apache.lucene.util.automaton;
 
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 public class TestRegExp extends LuceneTestCase {
 
   /**
@@ -83,4 +87,127 @@ public class TestRegExp extends LuceneTestCase {
     a = new RegExp("#?").toAutomaton(1000);
     assertTrue(a.toString().length() > 0);
   }
+
+  public void testCoreJavaParity() {
+    // Generate random doc values and random regular expressions
+    // and check for same matching behaviour as Java's Pattern class.
+    for (int i = 0; i < 1000; i++) {
+      checkRandomExpression(randomDocValue(1 + random().nextInt(30))); // doc values are 1..30 chars long
+    }
+  }
+
+  static String randomDocValue(int minLength) { // despite the name, minLength is the exact length produced
+    String charPalette = "AAAaaaBbbCccc123456 \t"; // repeated letters are simply drawn more often
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < minLength; i++) {
+      sb.append(charPalette.charAt(randomInt(charPalette.length()))); // exclusive bound: every palette char, including the final tab, can be drawn
+    }
+    return sb.toString();
+  }
+
+  private static int randomInt(int bound) { // like nextInt(bound), but returns 0 for bound == 0 instead of being called with 0
+    return bound == 0 ? 
0 : random().nextInt(bound);
+  }
+
+  protected String checkRandomExpression(String docValue) {
+    // Builds a random regex that must match docValue, checks it with java.util.regex and with Lucene, and returns it
+    StringBuilder result = new StringBuilder();
+    // Pick a part of the string to change
+    int substitutionPoint = randomInt(docValue.length() - 1); // bound is exclusive, so the last char never starts a substitution
+    int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));
+
+    // Add any head to the result, unchanged
+    if (substitutionPoint > 0) {
+      result.append(docValue.substring(0, substitutionPoint));
+    }
+
+    // Modify the middle...
+    String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
+    int mutation = random().nextInt(14); // 14 mutations, cases 0..13; a bound of 13 left case 13 unreachable
+    switch (mutation) {
+      case 0:
+        // OR with random alpha of same length
+        result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
+        break;
+      case 1:
+        // OR with non-existent value
+        result.append("(" + replacementPart + "|doesnotexist)");
+        break;
+      case 2:
+        // OR with another randomised regex (used to create nested levels of expression).
+        result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
+        break;
+      case 3:
+        // Star-replace all ab sequences.
+        result.append(replacementPart.replaceAll("ab", ".*"));
+        break;
+      case 4:
+        // .-replace all b chars
+        result.append(replacementPart.replaceAll("b", "."));
+        break;
+      case 5:
+        // length-limited stars {1,2}
+        result.append(".{1," + replacementPart.length() + "}");
+        break;
+      case 6:
+        // replace all chars with .
+        result.append(replacementPart.replaceAll(".", "."));
+        break;
+      case 7:
+        // OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..)
+        char[] chars = replacementPart.toCharArray();
+        for (char c : chars) {
+          result.append("[" + c + Character.toUpperCase(c) + "]");
+        }
+        break;
+      case 8:
+        // NOT a character - replace all b's with "not a"
+        result.append(replacementPart.replaceAll("b", "[^a]"));
+        break;
+      case 9:
+        // Make whole part repeatable 1 or more times
+        result.append("(" + replacementPart + ")+");
+        break;
+      case 10:
+        // Make whole part optional (0 or 1 times)
+        result.append("(" + replacementPart + ")?");
+        break;
+      case 11:
+        // Make any digits replaced by character class
+        result.append(replacementPart.replaceAll("\\d", "\\\\d"));
+        break;
+      case 12:
+        // Make any whitespace chars replaced by not word class
+        result.append(replacementPart.replaceAll("\\s", "\\\\W"));
+        break;
+      case 13:
+        // Make any whitespace chars replaced by whitespace class
+        result.append(replacementPart.replaceAll("\\s", "\\\\s"));
+        break;
+      default:
+        break;
+    }
+    // add any remaining tail, unchanged
+    if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
+      result.append(docValue.substring(substitutionPoint + substitutionLength));
+    }
+
+    String regexPattern = result.toString();
+    // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
+    Pattern pattern = Pattern.compile(regexPattern);
+    Matcher matcher = pattern.matcher(docValue);
+    assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
+
+    RegExp regex = new RegExp(regexPattern);
+    Automaton automaton = regex.toAutomaton();
+    ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
+    BytesRef br = new BytesRef(docValue);
+    assertTrue(
+        "[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
+            + docValue.length(),
+        bytesMatcher.run(br.bytes, br.offset, br.length)
+    );
+    return regexPattern; // returned so that case 2 can nest this expression inside a larger one
+  }
+
 }