mirror of https://github.com/apache/lucene.git
LUCENE-9370: Remove any leniency around use of backslashes in expressions as per the Java Pattern policy. (#1516)
Also fixes a bug introduced in LUCENE-9336 where searches for \\ would crash.
This commit is contained in:
parent
4e564079fb
commit
819e668ce2
|
@ -60,6 +60,9 @@ API Changes
|
|||
|
||||
Improvements
|
||||
|
||||
* LUCENE-9370: RegExp query is no longer lenient about inappropriate backslashes and
|
||||
follows the Java Pattern policy for rejecting illegal syntax. (Mark Harwood)
|
||||
|
||||
* LUCENE-9336: RegExp query now supports \w \W \d \D \s \S expressions.
|
||||
This is a break with previous behaviour where these were (mis)interpreted
|
||||
as literally the characters w W d etc. (Mark Harwood)
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
# Apache Lucene Migration Guide
|
||||
|
||||
## RegExpQuery now rejects invalid backslashes (LUCENE-9370)
|
||||
|
||||
We now follow the [Java rules](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs) for accepting backslashes.
|
||||
A backslash followed by any alphabetic character other than s, S, w, W, d or D is now considered illegal syntax and causes an `IllegalArgumentException` to be thrown.
|
||||
|
||||
## RegExp - certain regular expressions now match differently (LUCENE-9336)
|
||||
|
||||
The commonly used regular expressions \w, \W, \d, \D, \s and \S now work the same way as [Java Pattern](https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html#CHART) matching. Previously these expressions were (mis)interpreted as searches for the literal characters w, d, s etc.
|
||||
|
|
|
@ -1206,7 +1206,19 @@ public class RegExp {
|
|||
re.from = next();
|
||||
return re;
|
||||
}
|
||||
}
|
||||
|
||||
if (peek("\\")) {
|
||||
return makeChar(next());
|
||||
}
|
||||
|
||||
// From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
|
||||
// "It is an error to use a backslash prior to any alphabetic character that does not denote an escaped
|
||||
// construct;"
|
||||
if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) {
|
||||
throw new IllegalArgumentException("invalid character class \\" + next());
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
|
@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
|
|||
directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO));
|
||||
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3 \\", Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
|
@ -113,7 +113,16 @@ public class TestRegexpQuery extends LuceneTestCase {
|
|||
assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
|
||||
assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
|
||||
assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
|
||||
|
||||
|
||||
assertEquals(1, regexQueryNrHits("\\\\"));
|
||||
assertEquals(1, regexQueryNrHits("\\\\.*"));
|
||||
|
||||
IllegalArgumentException expected = expectThrows(
|
||||
IllegalArgumentException.class, () -> {
|
||||
regexQueryNrHits("\\p");
|
||||
}
|
||||
);
|
||||
assertTrue(expected.getMessage().contains("invalid character class"));
|
||||
}
|
||||
|
||||
public void testRegexComplement() throws IOException {
|
||||
|
|
|
@ -96,6 +96,27 @@ public class TestRegExp extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Checks that a backslash followed by each character listed in {@code illegalChars}
 * is rejected: constructing a {@code RegExp} from the two-character expression must
 * throw an {@link IllegalArgumentException} whose message contains
 * "invalid character class".
 */
public void testIllegalBackslashChars() {
|
||||
String illegalChars = "abcefghijklmnopqrtuvxyzABCEFGHIJKLMNOPQRTUVXYZ";
|
||||
for (int i = 0; i < illegalChars.length(); i++) {
|
||||
String illegalExpression = "\\" + illegalChars.charAt(i);
|
||||
IllegalArgumentException expected = expectThrows(
|
||||
IllegalArgumentException.class, () -> {
|
||||
new RegExp(illegalExpression);
|
||||
}
|
||||
);
|
||||
assertTrue(expected.getMessage().contains("invalid character class"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Checks that a backslash followed by each character listed in {@code legalChars}
 * is accepted: the test only constructs a {@code RegExp} for every two-character
 * expression and makes no further assertion, so it passes when no constructor
 * call throws.
 */
public void testLegalBackslashChars() {
|
||||
String legalChars = "dDsSWw0123456789[]*&^$@!{}\\/";
|
||||
for (int i = 0; i < legalChars.length(); i++) {
|
||||
String legalExpression = "\\" + legalChars.charAt(i);
|
||||
|
||||
new RegExp(legalExpression);
|
||||
}
|
||||
}
|
||||
|
||||
static String randomDocValue(int minLength) {
|
||||
String charPalette = "AAAaaaBbbCccc123456 \t";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
|
Loading…
Reference in New Issue