mirror of https://github.com/apache/lucene.git
RegEx querying - add support for Java’s predefined character classes like \d for digits (#1489)
Supports the same \w \W \s \S \d and \D character classes as Java's Pattern matcher.
This commit is contained in:
parent
4b9808a03d
commit
1efce5444d
|
@ -290,6 +290,55 @@ import java.util.Set;
|
|||
* <td>(a single non-reserved character)</td>
|
||||
* <td></td>
|
||||
* </tr>
|
||||
*
|
||||
* <tr>
|
||||
* <td></td>
|
||||
* <td>|</td>
|
||||
* <td><code><b>\d</b></code></td>
|
||||
* <td>(a digit [0-9])</td>
|
||||
* <td></td>
|
||||
* </tr>
|
||||
*
|
||||
* <tr>
|
||||
* <td></td>
|
||||
* <td>|</td>
|
||||
* <td><code><b>\D</b></code></td>
|
||||
* <td>(a non-digit [^0-9])</td>
|
||||
* <td></td>
|
||||
* </tr>
|
||||
*
|
||||
* <tr>
|
||||
* <td></td>
|
||||
* <td>|</td>
|
||||
* <td><code><b>\s</b></code></td>
|
||||
* <td>(whitespace [ \t\n\r])</td>
|
||||
* <td></td>
|
||||
* </tr>
|
||||
*
|
||||
* <tr>
|
||||
* <td></td>
|
||||
* <td>|</td>
|
||||
* <td><code><b>\S</b></code></td>
|
||||
* <td>(non whitespace [^\s])</td>
|
||||
* <td></td>
|
||||
* </tr>
|
||||
*
|
||||
* <tr>
|
||||
* <td></td>
|
||||
* <td>|</td>
|
||||
* <td><code><b>\w</b></code></td>
|
||||
* <td>(a word character [a-zA-Z_0-9])</td>
|
||||
* <td></td>
|
||||
* </tr>
|
||||
*
|
||||
* <tr>
|
||||
* <td></td>
|
||||
* <td>|</td>
|
||||
* <td><code><b>\W</b></code></td>
|
||||
* <td>(a non word character [^\w])</td>
|
||||
* <td></td>
|
||||
* </tr>
|
||||
*
|
||||
* <tr>
|
||||
* <td></td>
|
||||
* <td>|</td>
|
||||
|
@ -317,7 +366,8 @@ import java.util.Set;
|
|||
public class RegExp {
|
||||
|
||||
/**
 * Discriminates the kinds of node a {@link RegExp} can be. {@code REGEXP_PRE_CLASS} is the kind
 * assigned by {@code matchPredefinedCharacterClass()} to a predefined character class, i.e. a
 * backslash followed by one of {@code d D s S w W}.
 */
// NOTE(review): this view shows the constant list twice (without and with the trailing comma that
// precedes REGEXP_PRE_CLASS) - it looks like the before/after lines of a diff; confirm that only
// the second list exists in the real file.
enum Kind {
|
||||
REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
|
||||
REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL,
|
||||
REGEXP_PRE_CLASS
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -506,6 +556,10 @@ public class RegExp {
|
|||
List<Automaton> list;
|
||||
Automaton a = null;
|
||||
switch (kind) {
|
||||
case REGEXP_PRE_CLASS:
|
||||
RegExp expanded = expandPredefined();
|
||||
a = expanded.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
|
||||
break;
|
||||
case REGEXP_UNION:
|
||||
list = new ArrayList<>();
|
||||
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
|
||||
|
@ -716,6 +770,9 @@ public class RegExp {
|
|||
b.append('0');
|
||||
b.append(s2).append(">");
|
||||
break;
|
||||
case REGEXP_PRE_CLASS:
|
||||
b.append("\\").appendCodePoint(from);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -774,6 +831,13 @@ public class RegExp {
|
|||
b.appendCodePoint(c);
|
||||
b.append('\n');
|
||||
break;
|
||||
case REGEXP_PRE_CLASS:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
b.append(" class=\\");
|
||||
b.appendCodePoint(from);
|
||||
b.append('\n');
|
||||
break;
|
||||
case REGEXP_CHAR_RANGE:
|
||||
b.append(indent);
|
||||
b.append(kind);
|
||||
|
@ -1101,10 +1165,51 @@ public class RegExp {
|
|||
}
|
||||
|
||||
final RegExp parseCharClass() throws IllegalArgumentException {
|
||||
RegExp predefinedExp = matchPredefinedCharacterClass();
|
||||
if (predefinedExp != null) {
|
||||
return predefinedExp;
|
||||
}
|
||||
|
||||
int c = parseCharExp();
|
||||
if (match('-')) return makeCharRange(c, parseCharExp());
|
||||
else return makeChar(c);
|
||||
}
|
||||
|
||||
RegExp expandPredefined() {
|
||||
//See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
|
||||
switch (from) {
|
||||
case 'd':
|
||||
return new RegExp("[0-9]"); // digit
|
||||
case 'D':
|
||||
return new RegExp("[^0-9]"); // non-digit
|
||||
case 's':
|
||||
return new RegExp("[ \t\n\r]"); // whitespace
|
||||
case 'S':
|
||||
return new RegExp("[^\\s]"); // non-whitespace
|
||||
case 'w':
|
||||
return new RegExp("[a-zA-Z_0-9]"); // word
|
||||
case 'W':
|
||||
return new RegExp("[^\\w]"); // non-word
|
||||
default:
|
||||
throw new IllegalArgumentException(
|
||||
"invalid character class " + from);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
final RegExp matchPredefinedCharacterClass() {
|
||||
//See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
|
||||
if (match('\\')) {
|
||||
if (peek("dDwWsS")) {
|
||||
RegExp re =new RegExp();
|
||||
re.kind = Kind.REGEXP_PRE_CLASS;
|
||||
re.from = next();
|
||||
return re;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
final RegExp parseSimpleExp() throws IllegalArgumentException {
|
||||
if (match('.')) return makeAnyChar();
|
||||
|
@ -1158,7 +1263,13 @@ public class RegExp {
|
|||
"interval syntax error at position " + (pos - 1));
|
||||
}
|
||||
}
|
||||
} else return makeChar(parseCharExp());
|
||||
} else {
|
||||
RegExp predefined = matchPredefinedCharacterClass();
|
||||
if (predefined != null) {
|
||||
return predefined;
|
||||
}
|
||||
return makeChar(parseCharExp());
|
||||
}
|
||||
}
|
||||
|
||||
final int parseCharExp() throws IllegalArgumentException {
|
||||
|
|
|
@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
|
|||
directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344", Field.Store.NO));
|
||||
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
|
@ -90,6 +90,32 @@ public class TestRegexpQuery extends LuceneTestCase {
|
|||
assertEquals(0, regexQueryNrHits("<493433-600000>"));
|
||||
}
|
||||
|
||||
public void testCharacterClasses() throws IOException {
|
||||
assertEquals(0, regexQueryNrHits("\\d"));
|
||||
assertEquals(1, regexQueryNrHits("\\d*"));
|
||||
assertEquals(1, regexQueryNrHits("\\d{6}"));
|
||||
assertEquals(1, regexQueryNrHits("[a\\d]{6}"));
|
||||
assertEquals(1, regexQueryNrHits("\\d{2,7}"));
|
||||
assertEquals(0, regexQueryNrHits("\\d{4}"));
|
||||
assertEquals(0, regexQueryNrHits("\\dog"));
|
||||
assertEquals(1, regexQueryNrHits("493\\d32"));
|
||||
|
||||
assertEquals(1, regexQueryNrHits("\\wox"));
|
||||
assertEquals(1, regexQueryNrHits("493\\w32"));
|
||||
assertEquals(1, regexQueryNrHits("\\?\\?\\?"));
|
||||
assertEquals(1, regexQueryNrHits("\\?\\W\\?"));
|
||||
assertEquals(1, regexQueryNrHits("\\?\\S\\?"));
|
||||
|
||||
assertEquals(1, regexQueryNrHits("\\[foo\\]"));
|
||||
assertEquals(1, regexQueryNrHits("\\[\\w{3}\\]"));
|
||||
|
||||
assertEquals(0, regexQueryNrHits("\\s.*")); // no matches because all whitespace stripped
|
||||
assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
|
||||
assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
|
||||
assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
|
||||
|
||||
}
|
||||
|
||||
public void testRegexComplement() throws IOException {
|
||||
assertEquals(1, regexQueryNrHits("4934~[3]"));
|
||||
// not the empty lang, i.e. match all docs
|
||||
|
|
|
@ -17,8 +17,12 @@
|
|||
package org.apache.lucene.util.automaton;
|
||||
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class TestRegExp extends LuceneTestCase {
|
||||
|
||||
/**
|
||||
|
@ -83,4 +87,127 @@ public class TestRegExp extends LuceneTestCase {
|
|||
a = new RegExp("#?").toAutomaton(1000);
|
||||
assertTrue(a.toString().length() > 0);
|
||||
}
|
||||
|
||||
public void testCoreJavaParity() {
|
||||
// Generate random doc values and random regular expressions
|
||||
// and check for same matching behaviour as Java's Pattern class.
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
|
||||
}
|
||||
}
|
||||
|
||||
static String randomDocValue(int minLength) {
|
||||
String charPalette = "AAAaaaBbbCccc123456 \t";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < minLength; i++) {
|
||||
sb.append(charPalette.charAt(randomInt(charPalette.length() - 1)));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static int randomInt(int bound) {
|
||||
return bound == 0 ? 0 : random().nextInt(bound);
|
||||
}
|
||||
|
||||
protected String checkRandomExpression(String docValue) {
|
||||
// Generate and test a random regular expression which should match the given docValue
|
||||
StringBuilder result = new StringBuilder();
|
||||
// Pick a part of the string to change
|
||||
int substitutionPoint = randomInt(docValue.length() - 1);
|
||||
int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));
|
||||
|
||||
// Add any head to the result, unchanged
|
||||
if (substitutionPoint > 0) {
|
||||
result.append(docValue.substring(0, substitutionPoint));
|
||||
}
|
||||
|
||||
// Modify the middle...
|
||||
String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
|
||||
int mutation = random().nextInt(13);
|
||||
switch (mutation) {
|
||||
case 0:
|
||||
// OR with random alpha of same length
|
||||
result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
|
||||
break;
|
||||
case 1:
|
||||
// OR with non-existant value
|
||||
result.append("(" + replacementPart + "|doesnotexist)");
|
||||
break;
|
||||
case 2:
|
||||
// OR with another randomised regex (used to create nested levels of expression).
|
||||
result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
|
||||
break;
|
||||
case 3:
|
||||
// Star-replace all ab sequences.
|
||||
result.append(replacementPart.replaceAll("ab", ".*"));
|
||||
break;
|
||||
case 4:
|
||||
// .-replace all b chars
|
||||
result.append(replacementPart.replaceAll("b", "."));
|
||||
break;
|
||||
case 5:
|
||||
// length-limited stars {1,2}
|
||||
result.append(".{1," + replacementPart.length() + "}");
|
||||
break;
|
||||
case 6:
|
||||
// replace all chars with .
|
||||
result.append(replacementPart.replaceAll(".", "."));
|
||||
break;
|
||||
case 7:
|
||||
// OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..
|
||||
char[] chars = replacementPart.toCharArray();
|
||||
for (char c : chars) {
|
||||
result.append("[" + c + Character.toUpperCase(c) + "]");
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
// NOT a character - replace all b's with "not a"
|
||||
result.append(replacementPart.replaceAll("b", "[^a]"));
|
||||
break;
|
||||
case 9:
|
||||
// Make whole part repeatable 1 or more times
|
||||
result.append("(" + replacementPart + ")+");
|
||||
break;
|
||||
case 10:
|
||||
// Make whole part repeatable 0 or more times
|
||||
result.append("(" + replacementPart + ")?");
|
||||
break;
|
||||
case 11:
|
||||
// Make any digits replaced by character class
|
||||
result.append(replacementPart.replaceAll("\\d", "\\\\d"));
|
||||
break;
|
||||
case 12:
|
||||
// Make any whitespace chars replaced by not word class
|
||||
result.append(replacementPart.replaceAll("\\s", "\\\\W"));
|
||||
break;
|
||||
case 13:
|
||||
// Make any whitespace chars replace by whitespace class
|
||||
result.append(replacementPart.replaceAll("\\s", "\\\\s"));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// add any remaining tail, unchanged
|
||||
if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
|
||||
result.append(docValue.substring(substitutionPoint + substitutionLength));
|
||||
}
|
||||
|
||||
String regexPattern = result.toString();
|
||||
// Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
|
||||
Pattern pattern = Pattern.compile(regexPattern);
|
||||
Matcher matcher = pattern.matcher(docValue);
|
||||
assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
|
||||
|
||||
RegExp regex = new RegExp(regexPattern);
|
||||
Automaton automaton = regex.toAutomaton();
|
||||
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
|
||||
BytesRef br = new BytesRef(docValue);
|
||||
assertTrue(
|
||||
"[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
|
||||
+ docValue.length(),
|
||||
bytesMatcher.run(br.bytes, br.offset, br.length)
|
||||
);
|
||||
return regexPattern;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue