mirror of https://github.com/apache/lucene.git
RegEx querying - add support for Java’s predefined character classes like \d for digits (#1489)
Supports the same \w \W \s \S \d and \D character classes as Java's Pattern matcher.
This commit is contained in:
parent
4b9808a03d
commit
1efce5444d
|
@ -290,6 +290,55 @@ import java.util.Set;
|
||||||
* <td>(a single non-reserved character)</td>
|
* <td>(a single non-reserved character)</td>
|
||||||
* <td></td>
|
* <td></td>
|
||||||
* </tr>
|
* </tr>
|
||||||
|
*
|
||||||
|
* <tr>
|
||||||
|
* <td></td>
|
||||||
|
* <td>|</td>
|
||||||
|
* <td><code><b>\d</b></code></td>
|
||||||
|
* <td>(a digit [0-9])</td>
|
||||||
|
* <td></td>
|
||||||
|
* </tr>
|
||||||
|
*
|
||||||
|
* <tr>
|
||||||
|
* <td></td>
|
||||||
|
* <td>|</td>
|
||||||
|
* <td><code><b>\D</b></code></td>
|
||||||
|
* <td>(a non-digit [^0-9])</td>
|
||||||
|
* <td></td>
|
||||||
|
* </tr>
|
||||||
|
*
|
||||||
|
* <tr>
|
||||||
|
* <td></td>
|
||||||
|
* <td>|</td>
|
||||||
|
* <td><code><b>\s</b></code></td>
|
||||||
|
* <td>(whitespace [ \t\n\r])</td>
|
||||||
|
* <td></td>
|
||||||
|
* </tr>
|
||||||
|
*
|
||||||
|
* <tr>
|
||||||
|
* <td></td>
|
||||||
|
* <td>|</td>
|
||||||
|
* <td><code><b>\S</b></code></td>
|
||||||
|
* <td>(non whitespace [^\s])</td>
|
||||||
|
* <td></td>
|
||||||
|
* </tr>
|
||||||
|
*
|
||||||
|
* <tr>
|
||||||
|
* <td></td>
|
||||||
|
* <td>|</td>
|
||||||
|
* <td><code><b>\w</b></code></td>
|
||||||
|
* <td>(a word character [a-zA-Z_0-9])</td>
|
||||||
|
* <td></td>
|
||||||
|
* </tr>
|
||||||
|
*
|
||||||
|
* <tr>
|
||||||
|
* <td></td>
|
||||||
|
* <td>|</td>
|
||||||
|
* <td><code><b>\W</b></code></td>
|
||||||
|
* <td>(a non word character [^\w])</td>
|
||||||
|
* <td></td>
|
||||||
|
* </tr>
|
||||||
|
*
|
||||||
* <tr>
|
* <tr>
|
||||||
* <td></td>
|
* <td></td>
|
||||||
* <td>|</td>
|
* <td>|</td>
|
||||||
|
@ -317,7 +366,8 @@ import java.util.Set;
|
||||||
public class RegExp {
|
public class RegExp {
|
||||||
|
|
||||||
enum Kind {
|
enum Kind {
|
||||||
REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL
|
REGEXP_UNION, REGEXP_CONCATENATION, REGEXP_INTERSECTION, REGEXP_OPTIONAL, REGEXP_REPEAT, REGEXP_REPEAT_MIN, REGEXP_REPEAT_MINMAX, REGEXP_COMPLEMENT, REGEXP_CHAR, REGEXP_CHAR_RANGE, REGEXP_ANYCHAR, REGEXP_EMPTY, REGEXP_STRING, REGEXP_ANYSTRING, REGEXP_AUTOMATON, REGEXP_INTERVAL,
|
||||||
|
REGEXP_PRE_CLASS
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -506,6 +556,10 @@ public class RegExp {
|
||||||
List<Automaton> list;
|
List<Automaton> list;
|
||||||
Automaton a = null;
|
Automaton a = null;
|
||||||
switch (kind) {
|
switch (kind) {
|
||||||
|
case REGEXP_PRE_CLASS:
|
||||||
|
RegExp expanded = expandPredefined();
|
||||||
|
a = expanded.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
|
||||||
|
break;
|
||||||
case REGEXP_UNION:
|
case REGEXP_UNION:
|
||||||
list = new ArrayList<>();
|
list = new ArrayList<>();
|
||||||
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
|
findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
|
||||||
|
@ -716,6 +770,9 @@ public class RegExp {
|
||||||
b.append('0');
|
b.append('0');
|
||||||
b.append(s2).append(">");
|
b.append(s2).append(">");
|
||||||
break;
|
break;
|
||||||
|
case REGEXP_PRE_CLASS:
|
||||||
|
b.append("\\").appendCodePoint(from);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -774,6 +831,13 @@ public class RegExp {
|
||||||
b.appendCodePoint(c);
|
b.appendCodePoint(c);
|
||||||
b.append('\n');
|
b.append('\n');
|
||||||
break;
|
break;
|
||||||
|
case REGEXP_PRE_CLASS:
|
||||||
|
b.append(indent);
|
||||||
|
b.append(kind);
|
||||||
|
b.append(" class=\\");
|
||||||
|
b.appendCodePoint(from);
|
||||||
|
b.append('\n');
|
||||||
|
break;
|
||||||
case REGEXP_CHAR_RANGE:
|
case REGEXP_CHAR_RANGE:
|
||||||
b.append(indent);
|
b.append(indent);
|
||||||
b.append(kind);
|
b.append(kind);
|
||||||
|
@ -1101,10 +1165,51 @@ public class RegExp {
|
||||||
}
|
}
|
||||||
|
|
||||||
final RegExp parseCharClass() throws IllegalArgumentException {
|
final RegExp parseCharClass() throws IllegalArgumentException {
|
||||||
|
RegExp predefinedExp = matchPredefinedCharacterClass();
|
||||||
|
if (predefinedExp != null) {
|
||||||
|
return predefinedExp;
|
||||||
|
}
|
||||||
|
|
||||||
int c = parseCharExp();
|
int c = parseCharExp();
|
||||||
if (match('-')) return makeCharRange(c, parseCharExp());
|
if (match('-')) return makeCharRange(c, parseCharExp());
|
||||||
else return makeChar(c);
|
else return makeChar(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
RegExp expandPredefined() {
|
||||||
|
//See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
|
||||||
|
switch (from) {
|
||||||
|
case 'd':
|
||||||
|
return new RegExp("[0-9]"); // digit
|
||||||
|
case 'D':
|
||||||
|
return new RegExp("[^0-9]"); // non-digit
|
||||||
|
case 's':
|
||||||
|
return new RegExp("[ \t\n\r]"); // whitespace
|
||||||
|
case 'S':
|
||||||
|
return new RegExp("[^\\s]"); // non-whitespace
|
||||||
|
case 'w':
|
||||||
|
return new RegExp("[a-zA-Z_0-9]"); // word
|
||||||
|
case 'W':
|
||||||
|
return new RegExp("[^\\w]"); // non-word
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"invalid character class " + from);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final RegExp matchPredefinedCharacterClass() {
|
||||||
|
//See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
|
||||||
|
if (match('\\')) {
|
||||||
|
if (peek("dDwWsS")) {
|
||||||
|
RegExp re =new RegExp();
|
||||||
|
re.kind = Kind.REGEXP_PRE_CLASS;
|
||||||
|
re.from = next();
|
||||||
|
return re;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
final RegExp parseSimpleExp() throws IllegalArgumentException {
|
final RegExp parseSimpleExp() throws IllegalArgumentException {
|
||||||
if (match('.')) return makeAnyChar();
|
if (match('.')) return makeAnyChar();
|
||||||
|
@ -1158,7 +1263,13 @@ public class RegExp {
|
||||||
"interval syntax error at position " + (pos - 1));
|
"interval syntax error at position " + (pos - 1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else return makeChar(parseCharExp());
|
} else {
|
||||||
|
RegExp predefined = matchPredefinedCharacterClass();
|
||||||
|
if (predefined != null) {
|
||||||
|
return predefined;
|
||||||
|
}
|
||||||
|
return makeChar(parseCharExp());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final int parseCharExp() throws IllegalArgumentException {
|
final int parseCharExp() throws IllegalArgumentException {
|
||||||
|
|
|
@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
|
||||||
directory = newDirectory();
|
directory = newDirectory();
|
||||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344", Field.Store.NO));
|
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO));
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
reader = writer.getReader();
|
reader = writer.getReader();
|
||||||
writer.close();
|
writer.close();
|
||||||
|
@ -90,6 +90,32 @@ public class TestRegexpQuery extends LuceneTestCase {
|
||||||
assertEquals(0, regexQueryNrHits("<493433-600000>"));
|
assertEquals(0, regexQueryNrHits("<493433-600000>"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCharacterClasses() throws IOException {
|
||||||
|
assertEquals(0, regexQueryNrHits("\\d"));
|
||||||
|
assertEquals(1, regexQueryNrHits("\\d*"));
|
||||||
|
assertEquals(1, regexQueryNrHits("\\d{6}"));
|
||||||
|
assertEquals(1, regexQueryNrHits("[a\\d]{6}"));
|
||||||
|
assertEquals(1, regexQueryNrHits("\\d{2,7}"));
|
||||||
|
assertEquals(0, regexQueryNrHits("\\d{4}"));
|
||||||
|
assertEquals(0, regexQueryNrHits("\\dog"));
|
||||||
|
assertEquals(1, regexQueryNrHits("493\\d32"));
|
||||||
|
|
||||||
|
assertEquals(1, regexQueryNrHits("\\wox"));
|
||||||
|
assertEquals(1, regexQueryNrHits("493\\w32"));
|
||||||
|
assertEquals(1, regexQueryNrHits("\\?\\?\\?"));
|
||||||
|
assertEquals(1, regexQueryNrHits("\\?\\W\\?"));
|
||||||
|
assertEquals(1, regexQueryNrHits("\\?\\S\\?"));
|
||||||
|
|
||||||
|
assertEquals(1, regexQueryNrHits("\\[foo\\]"));
|
||||||
|
assertEquals(1, regexQueryNrHits("\\[\\w{3}\\]"));
|
||||||
|
|
||||||
|
assertEquals(0, regexQueryNrHits("\\s.*")); // no matches because all whitespace stripped
|
||||||
|
assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
|
||||||
|
assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
|
||||||
|
assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public void testRegexComplement() throws IOException {
|
public void testRegexComplement() throws IOException {
|
||||||
assertEquals(1, regexQueryNrHits("4934~[3]"));
|
assertEquals(1, regexQueryNrHits("4934~[3]"));
|
||||||
// not the empty lang, i.e. match all docs
|
// not the empty lang, i.e. match all docs
|
||||||
|
|
|
@ -17,8 +17,12 @@
|
||||||
package org.apache.lucene.util.automaton;
|
package org.apache.lucene.util.automaton;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
public class TestRegExp extends LuceneTestCase {
|
public class TestRegExp extends LuceneTestCase {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -83,4 +87,127 @@ public class TestRegExp extends LuceneTestCase {
|
||||||
a = new RegExp("#?").toAutomaton(1000);
|
a = new RegExp("#?").toAutomaton(1000);
|
||||||
assertTrue(a.toString().length() > 0);
|
assertTrue(a.toString().length() > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCoreJavaParity() {
|
||||||
|
// Generate random doc values and random regular expressions
|
||||||
|
// and check for same matching behaviour as Java's Pattern class.
|
||||||
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static String randomDocValue(int minLength) {
|
||||||
|
String charPalette = "AAAaaaBbbCccc123456 \t";
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (int i = 0; i < minLength; i++) {
|
||||||
|
sb.append(charPalette.charAt(randomInt(charPalette.length() - 1)));
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int randomInt(int bound) {
|
||||||
|
return bound == 0 ? 0 : random().nextInt(bound);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String checkRandomExpression(String docValue) {
|
||||||
|
// Generate and test a random regular expression which should match the given docValue
|
||||||
|
StringBuilder result = new StringBuilder();
|
||||||
|
// Pick a part of the string to change
|
||||||
|
int substitutionPoint = randomInt(docValue.length() - 1);
|
||||||
|
int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));
|
||||||
|
|
||||||
|
// Add any head to the result, unchanged
|
||||||
|
if (substitutionPoint > 0) {
|
||||||
|
result.append(docValue.substring(0, substitutionPoint));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Modify the middle...
|
||||||
|
String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
|
||||||
|
int mutation = random().nextInt(13);
|
||||||
|
switch (mutation) {
|
||||||
|
case 0:
|
||||||
|
// OR with random alpha of same length
|
||||||
|
result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
// OR with non-existant value
|
||||||
|
result.append("(" + replacementPart + "|doesnotexist)");
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
// OR with another randomised regex (used to create nested levels of expression).
|
||||||
|
result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
// Star-replace all ab sequences.
|
||||||
|
result.append(replacementPart.replaceAll("ab", ".*"));
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
// .-replace all b chars
|
||||||
|
result.append(replacementPart.replaceAll("b", "."));
|
||||||
|
break;
|
||||||
|
case 5:
|
||||||
|
// length-limited stars {1,2}
|
||||||
|
result.append(".{1," + replacementPart.length() + "}");
|
||||||
|
break;
|
||||||
|
case 6:
|
||||||
|
// replace all chars with .
|
||||||
|
result.append(replacementPart.replaceAll(".", "."));
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
// OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..
|
||||||
|
char[] chars = replacementPart.toCharArray();
|
||||||
|
for (char c : chars) {
|
||||||
|
result.append("[" + c + Character.toUpperCase(c) + "]");
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 8:
|
||||||
|
// NOT a character - replace all b's with "not a"
|
||||||
|
result.append(replacementPart.replaceAll("b", "[^a]"));
|
||||||
|
break;
|
||||||
|
case 9:
|
||||||
|
// Make whole part repeatable 1 or more times
|
||||||
|
result.append("(" + replacementPart + ")+");
|
||||||
|
break;
|
||||||
|
case 10:
|
||||||
|
// Make whole part repeatable 0 or more times
|
||||||
|
result.append("(" + replacementPart + ")?");
|
||||||
|
break;
|
||||||
|
case 11:
|
||||||
|
// Make any digits replaced by character class
|
||||||
|
result.append(replacementPart.replaceAll("\\d", "\\\\d"));
|
||||||
|
break;
|
||||||
|
case 12:
|
||||||
|
// Make any whitespace chars replaced by not word class
|
||||||
|
result.append(replacementPart.replaceAll("\\s", "\\\\W"));
|
||||||
|
break;
|
||||||
|
case 13:
|
||||||
|
// Make any whitespace chars replace by whitespace class
|
||||||
|
result.append(replacementPart.replaceAll("\\s", "\\\\s"));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// add any remaining tail, unchanged
|
||||||
|
if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
|
||||||
|
result.append(docValue.substring(substitutionPoint + substitutionLength));
|
||||||
|
}
|
||||||
|
|
||||||
|
String regexPattern = result.toString();
|
||||||
|
// Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
|
||||||
|
Pattern pattern = Pattern.compile(regexPattern);
|
||||||
|
Matcher matcher = pattern.matcher(docValue);
|
||||||
|
assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());
|
||||||
|
|
||||||
|
RegExp regex = new RegExp(regexPattern);
|
||||||
|
Automaton automaton = regex.toAutomaton();
|
||||||
|
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
|
||||||
|
BytesRef br = new BytesRef(docValue);
|
||||||
|
assertTrue(
|
||||||
|
"[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
|
||||||
|
+ docValue.length(),
|
||||||
|
bytesMatcher.run(br.bytes, br.offset, br.length)
|
||||||
|
);
|
||||||
|
return regexPattern;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue