RegEx querying - add support for Java’s predefined character classes like \d for digits (#1489)

Supports the same \w \W \s \S \d and \D character classes as Java's Pattern matcher.
This commit is contained in:
markharwood 2020-05-14 10:04:25 +01:00 committed by GitHub
parent 4b9808a03d
commit 1efce5444d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 267 additions and 3 deletions

View File

@ -290,6 +290,55 @@ import java.util.Set;
* <td>(a single non-reserved character)</td> * <td>(a single non-reserved character)</td>
* <td></td> * <td></td>
* </tr> * </tr>
*
* <tr>
* <td></td>
* <td>|</td>
* <td><code><b>\d</b></code></td>
* <td>(a digit [0-9])</td>
* <td></td>
* </tr>
*
* <tr>
* <td></td>
* <td>|</td>
* <td><code><b>\D</b></code></td>
* <td>(a non-digit [^0-9])</td>
* <td></td>
* </tr>
*
* <tr>
* <td></td>
* <td>|</td>
* <td><code><b>\s</b></code></td>
* <td>(whitespace [ \t\n\r])</td>
* <td></td>
* </tr>
*
* <tr>
* <td></td>
* <td>|</td>
* <td><code><b>\S</b></code></td>
* <td>(a non-whitespace character [^\s])</td>
* <td></td>
* </tr>
*
* <tr>
* <td></td>
* <td>|</td>
* <td><code><b>\w</b></code></td>
* <td>(a word character [a-zA-Z_0-9])</td>
* <td></td>
* </tr>
*
* <tr>
* <td></td>
* <td>|</td>
* <td><code><b>\W</b></code></td>
* <td>(a non-word character [^\w])</td>
* <td></td>
* </tr>
*
* <tr> * <tr>
* <td></td> * <td></td>
* <td>|</td> * <td>|</td>
@ -317,7 +366,8 @@ import java.util.Set;
public class RegExp { public class RegExp {
enum Kind { enum Kind {
REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL,
REGEXP_PRE_CLASS
} }
/** /**
@ -506,6 +556,10 @@ public class RegExp {
List<Automaton> list; List<Automaton> list;
Automaton a = null; Automaton a = null;
switch (kind) { switch (kind) {
case REGEXP_PRE_CLASS:
RegExp expanded = expandPredefined();
a = expanded.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
break;
case REGEXP_UNION: case REGEXP_UNION:
list = new ArrayList<>(); list = new ArrayList<>();
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider, findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
@ -716,6 +770,9 @@ public class RegExp {
b.append('0'); b.append('0');
b.append(s2).append(">"); b.append(s2).append(">");
break; break;
case REGEXP_PRE_CLASS:
b.append("\\").appendCodePoint(from);
break;
} }
} }
@ -774,6 +831,13 @@ public class RegExp {
b.appendCodePoint(c); b.appendCodePoint(c);
b.append('\n'); b.append('\n');
break; break;
case REGEXP_PRE_CLASS:
b.append(indent);
b.append(kind);
b.append(" class=\\");
b.appendCodePoint(from);
b.append('\n');
break;
case REGEXP_CHAR_RANGE: case REGEXP_CHAR_RANGE:
b.append(indent); b.append(indent);
b.append(kind); b.append(kind);
@ -1101,10 +1165,51 @@ public class RegExp {
} }
final RegExp parseCharClass() throws IllegalArgumentException { final RegExp parseCharClass() throws IllegalArgumentException {
RegExp predefinedExp = matchPredefinedCharacterClass();
if (predefinedExp != null) {
return predefinedExp;
}
int c = parseCharExp(); int c = parseCharExp();
if (match('-')) return makeCharRange(c, parseCharExp()); if (match('-')) return makeCharRange(c, parseCharExp());
else return makeChar(c); else return makeChar(c);
} }
/**
 * Expands this predefined character class node into an equivalent bracket expression,
 * selected by the class letter held in {@code from}.
 *
 * <p>The negated forms ({@code \S}, {@code \W}) are written in terms of the positive
 * classes, so parsing them expands {@code \s} / {@code \w} again inside the brackets.
 *
 * <p>NOTE: {@code \s} expands to {@code [ \t\n\r]}. That is narrower than
 * {@code java.util.regex.Pattern}, whose {@code \s} also covers vertical tab and form feed.
 *
 * @return a freshly parsed {@code RegExp} for the expanded class
 * @throws IllegalArgumentException if {@code from} is not one of d, D, s, S, w, W
 */
RegExp expandPredefined() {
  //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
  switch (from) {
    case 'd':
      return new RegExp("[0-9]"); // digit
    case 'D':
      return new RegExp("[^0-9]"); // non-digit
    case 's':
      return new RegExp("[ \t\n\r]"); // whitespace
    case 'S':
      return new RegExp("[^\\s]"); // non-whitespace
    case 'w':
      return new RegExp("[a-zA-Z_0-9]"); // word
    case 'W':
      return new RegExp("[^\\w]"); // non-word
    default:
      // from is an int code point: concatenating it directly would print a number (e.g. 120),
      // so render it as the class was written (backslash + character).
      throw new IllegalArgumentException(
          "invalid character class \\" + new String(Character.toChars(from)));
  }
}
/**
 * Tries to consume a predefined character class escape (backslash followed by one of
 * d, D, w, W, s, S) at the current parse position.
 *
 * @return a {@code REGEXP_PRE_CLASS} node whose {@code from} holds the class letter, or
 *     {@code null} with the parse position left unchanged when the input here is not
 *     such an escape
 */
final RegExp matchPredefinedCharacterClass() {
  //See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
  final int start = pos;
  if (match('\\')) {
    if (peek("dDwWsS")) {
      RegExp re = new RegExp();
      re.kind = Kind.REGEXP_PRE_CLASS;
      re.from = next();
      return re;
    }
    // Not a predefined class: hand the backslash back so the caller's ordinary escape
    // handling sees the whole escape. Leaving it consumed broke an escaped backslash:
    // the second backslash was then taken as the escape marker instead of the literal.
    // NOTE(review): assumes the fallback (parseCharExp) skips one optional backslash
    // before reading a character - confirm against its definition.
    pos = start;
  }
  return null;
}
final RegExp parseSimpleExp() throws IllegalArgumentException { final RegExp parseSimpleExp() throws IllegalArgumentException {
if (match('.')) return makeAnyChar(); if (match('.')) return makeAnyChar();
@ -1158,7 +1263,13 @@ public class RegExp {
"interval syntax error at position " + (pos - 1)); "interval syntax error at position " + (pos - 1));
} }
} }
} else return makeChar(parseCharExp()); } else {
RegExp predefined = matchPredefinedCharacterClass();
if (predefined != null) {
return predefined;
}
return makeChar(parseCharExp());
}
} }
final int parseCharExp() throws IllegalArgumentException { final int parseCharExp() throws IllegalArgumentException {

View File

@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
directory = newDirectory(); directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory); RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
Document doc = new Document(); Document doc = new Document();
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344", Field.Store.NO)); doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO));
writer.addDocument(doc); writer.addDocument(doc);
reader = writer.getReader(); reader = writer.getReader();
writer.close(); writer.close();
@ -90,6 +90,32 @@ public class TestRegexpQuery extends LuceneTestCase {
assertEquals(0, regexQueryNrHits("<493433-600000>")); assertEquals(0, regexQueryNrHits("<493433-600000>"));
} }
/**
 * Exercises the predefined character classes (\d, \w, \W, \s, \S), alone, inside
 * brackets and combined with repetition, via regexQueryNrHits.
 *
 * NOTE(review): the expected values are presumably matching-document counts; only one
 * document is indexed, so each expectation is 0 or 1.
 */
public void testCharacterClasses() throws IOException {
// presumably a regex has to match a whole term: there is no single-digit term ...
assertEquals(0, regexQueryNrHits("\\d"));
assertEquals(1, regexQueryNrHits("\\d*"));
// ... but there is a six-digit one (493432)
assertEquals(1, regexQueryNrHits("\\d{6}"));
// class used inside a bracket expression
assertEquals(1, regexQueryNrHits("[a\\d]{6}"));
assertEquals(1, regexQueryNrHits("\\d{2,7}"));
// no term in the indexed text consists of exactly four digits
assertEquals(0, regexQueryNrHits("\\d{4}"));
// \d must be read as the digit class, not as an escaped literal 'd' (which would give "dog")
assertEquals(0, regexQueryNrHits("\\dog"));
assertEquals(1, regexQueryNrHits("493\\d32"));
// \w covers letters as well as digits
assertEquals(1, regexQueryNrHits("\\wox"));
assertEquals(1, regexQueryNrHits("493\\w32"));
// ordinary escapes of reserved characters must keep working next to the new classes
assertEquals(1, regexQueryNrHits("\\?\\?\\?"));
// '?' is both a non-word and a non-whitespace character
assertEquals(1, regexQueryNrHits("\\?\\W\\?"));
assertEquals(1, regexQueryNrHits("\\?\\S\\?"));
assertEquals(1, regexQueryNrHits("\\[foo\\]"));
assertEquals(1, regexQueryNrHits("\\[\\w{3}\\]"));
assertEquals(0, regexQueryNrHits("\\s.*")); // no matches because all whitespace stripped
assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
}
public void testRegexComplement() throws IOException { public void testRegexComplement() throws IOException {
assertEquals(1, regexQueryNrHits("4934~[3]")); assertEquals(1, regexQueryNrHits("4934~[3]"));
// not the empty lang, i.e. match all docs // not the empty lang, i.e. match all docs

View File

@ -17,8 +17,12 @@
package org.apache.lucene.util.automaton; package org.apache.lucene.util.automaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TestRegExp extends LuceneTestCase { public class TestRegExp extends LuceneTestCase {
/** /**
@ -83,4 +87,127 @@ public class TestRegExp extends LuceneTestCase {
a = new RegExp("#?").toAutomaton(1000); a = new RegExp("#?").toAutomaton(1000);
assertTrue(a.toString().length() > 0); assertTrue(a.toString().length() > 0);
} }
/**
 * Runs 1000 rounds of random value / random regular expression generation; each round
 * verifies that the expression matches both with Java's Pattern class and with RegExp.
 */
public void testCoreJavaParity() {
  int remainingRounds = 1000;
  while (remainingRounds-- > 0) {
    // value length is between 1 and 30 characters
    int valueLength = 1 + random().nextInt(30);
    String docValue = randomDocValue(valueLength);
    checkRandomExpression(docValue);
  }
}
/**
 * Builds a random string from a small palette of letters, digits, space and tab.
 * Repeated palette entries make those characters proportionally more likely.
 *
 * @param minLength number of characters to generate; despite the name, the result is
 *     exactly this long
 * @return the generated string
 */
static String randomDocValue(int minLength) {
  String charPalette = "AAAaaaBbbCccc123456 \t";
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < minLength; i++) {
    // randomInt's bound is exclusive, so pass the full palette length: the former
    // "length() - 1" meant the trailing tab character could never be selected.
    sb.append(charPalette.charAt(randomInt(charPalette.length())));
  }
  return sb.toString();
}
/**
 * Returns a random int in [0, bound), or 0 when bound is 0 (a bound of 0 is not passed
 * on to nextInt).
 */
private static int randomInt(int bound) {
  if (bound != 0) {
    return random().nextInt(bound);
  }
  return 0;
}
/**
 * Generates a random regular expression that should match {@code docValue} and asserts
 * that it does so with both Java's {@link Pattern} and a {@link ByteRunAutomaton} built
 * from {@code RegExp}.
 *
 * <p>A random slice of the value is rewritten by one randomly chosen mutation (alternation,
 * wildcards, character classes, repetition, ...); the text before and after the slice is
 * kept verbatim.
 *
 * @param docValue the value the generated expression must match; must not be empty
 * @return the generated regular expression
 */
protected String checkRandomExpression(String docValue) {
  // Generate and test a random regular expression which should match the given docValue
  StringBuilder result = new StringBuilder();
  // Pick a part of the string to change
  int substitutionPoint = randomInt(docValue.length() - 1);
  int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));

  // Add any head to the result, unchanged
  if (substitutionPoint > 0) {
    result.append(docValue.substring(0, substitutionPoint));
  }

  // Modify the middle...
  String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
  // nextInt's bound is exclusive: 14 is needed for mutations 0..13 to all be reachable
  // (with 13, the whitespace-class mutation in case 13 was never exercised).
  int mutation = random().nextInt(14);
  switch (mutation) {
    case 0:
      // OR with random alpha of same length
      result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
      break;
    case 1:
      // OR with non-existent value
      result.append("(" + replacementPart + "|doesnotexist)");
      break;
    case 2:
      // OR with another randomised regex (used to create nested levels of expression).
      result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
      break;
    case 3:
      // Star-replace all ab sequences.
      result.append(replacementPart.replaceAll("ab", ".*"));
      break;
    case 4:
      // .-replace all b chars
      result.append(replacementPart.replaceAll("b", "."));
      break;
    case 5:
      // length-limited stars {1,2}
      result.append(".{1," + replacementPart.length() + "}");
      break;
    case 6:
      // replace all chars with .
      result.append(replacementPart.replaceAll(".", "."));
      break;
    case 7:
      // OR with uppercase chars eg [aA] (many of these sorts of expression in the wild...)
      char[] chars = replacementPart.toCharArray();
      for (char c : chars) {
        result.append("[" + c + Character.toUpperCase(c) + "]");
      }
      break;
    case 8:
      // NOT a character - replace all b's with "not a"
      result.append(replacementPart.replaceAll("b", "[^a]"));
      break;
    case 9:
      // Make whole part repeatable 1 or more times
      result.append("(" + replacementPart + ")+");
      break;
    case 10:
      // Make whole part optional (0 or 1 times)
      result.append("(" + replacementPart + ")?");
      break;
    case 11:
      // Make any digits replaced by character class
      result.append(replacementPart.replaceAll("\\d", "\\\\d"));
      break;
    case 12:
      // Make any whitespace chars replaced by not word class
      result.append(replacementPart.replaceAll("\\s", "\\\\W"));
      break;
    case 13:
      // Make any whitespace chars replaced by whitespace class
      result.append(replacementPart.replaceAll("\\s", "\\\\s"));
      break;
    default:
      break;
  }

  // add any remaining tail, unchanged
  if (substitutionPoint + substitutionLength < docValue.length()) {
    result.append(docValue.substring(substitutionPoint + substitutionLength));
  }

  String regexPattern = result.toString();
  // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
  Pattern pattern = Pattern.compile(regexPattern);
  Matcher matcher = pattern.matcher(docValue);
  assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());

  // ... and that the automaton built from the same expression accepts the same value
  RegExp regex = new RegExp(regexPattern);
  Automaton automaton = regex.toAutomaton();
  ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
  BytesRef br = new BytesRef(docValue);
  assertTrue(
      "[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
          + docValue.length(),
      bytesMatcher.run(br.bytes, br.offset, br.length)
  );
  return regexPattern;
}
} }