Lucene 9370: Remove any leniency around use of backslashes in expressions as per the Java Pattern policy. (#1516)

Also fixes a bug introduced in Lucene-9336 where searches for \\ would crash.
This commit is contained in:
markharwood 2020-05-15 11:45:18 +01:00 committed by GitHub
parent 4e564079fb
commit 819e668ce2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 53 additions and 3 deletions

View File

@ -60,6 +60,9 @@ API Changes
Improvements Improvements
* LUCENE-9370: RegExp query is no longer lenient about inappropriate backslashes and
follows the Java Pattern policy for rejecting illegal syntax. (Mark Harwood)
* LUCENE-9336: RegExp query now supports \w \W \d \D \s \S expressions. * LUCENE-9336: RegExp query now supports \w \W \d \D \s \S expressions.
This is a break with previous behaviour where these were (mis)interpreted This is a break with previous behaviour where these were (mis)interpreted
as literally the characters w W d etc. (Mark Harwood) as literally the characters w W d etc. (Mark Harwood)

View File

@ -1,5 +1,10 @@
# Apache Lucene Migration Guide # Apache Lucene Migration Guide
## RegExpQuery now rejects invalid backslashes (LUCENE-9370)
We now follow the [Java rules](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs) for accepting backslashes.
Any alphabetic character other than s, S, w, W, d or D that is preceded by a backslash is now considered illegal syntax, and parsing such an expression throws an `IllegalArgumentException`.
## RegExp certain regular expressions now match differently (LUCENE-9336) ## RegExp certain regular expressions now match differently (LUCENE-9336)
The commonly used regular expressions \w \W \d \D \s and \S now work the same way [Java Pattern](https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html#CHART) matching works. Previously these expressions were (mis)interpreted as searches for the literal characters w, d, s etc. The commonly used regular expressions \w \W \d \D \s and \S now work the same way [Java Pattern](https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html#CHART) matching works. Previously these expressions were (mis)interpreted as searches for the literal characters w, d, s etc.

View File

@ -1206,7 +1206,19 @@ public class RegExp {
re.from = next(); re.from = next();
return re; return re;
} }
}
if (peek("\\")) {
return makeChar(next());
}
// From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
// "It is an error to use a backslash prior to any alphabetic character that does not denote an escaped
// construct;"
if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) {
throw new IllegalArgumentException("invalid character class \\" + next());
}
}
return null; return null;
} }

View File

@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
directory = newDirectory(); directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory); RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
Document doc = new Document(); Document doc = new Document();
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO)); doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3 \\", Field.Store.NO));
writer.addDocument(doc); writer.addDocument(doc);
reader = writer.getReader(); reader = writer.getReader();
writer.close(); writer.close();
@ -113,7 +113,16 @@ public class TestRegexpQuery extends LuceneTestCase {
assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3 assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3 assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
assertEquals(1, regexQueryNrHits("\\\\"));
assertEquals(1, regexQueryNrHits("\\\\.*"));
IllegalArgumentException expected = expectThrows(
IllegalArgumentException.class, () -> {
regexQueryNrHits("\\p");
}
);
assertTrue(expected.getMessage().contains("invalid character class"));
} }
public void testRegexComplement() throws IOException { public void testRegexComplement() throws IOException {

View File

@ -96,6 +96,27 @@ public class TestRegExp extends LuceneTestCase {
} }
} }
/**
 * Checks that a backslash followed by any ASCII letter other than
 * d, D, s, S, w or W is rejected when the expression is constructed,
 * and that the failure message mentions "invalid character class".
 */
public void testIllegalBackslashChars() {
  final String rejectedLetters = "abcefghijklmnopqrtuvxyzABCEFGHIJKLMNOPQRTUVXYZ";
  for (char letter : rejectedLetters.toCharArray()) {
    // A single backslash followed by the letter under test.
    final String expression = "\\" + letter;
    IllegalArgumentException failure =
        expectThrows(IllegalArgumentException.class, () -> new RegExp(expression));
    assertTrue(failure.getMessage().contains("invalid character class"));
  }
}
/**
 * Builds a RegExp from a backslash followed by each character in the
 * accepted set (the d/D/s/S/w/W shorthands, digits and assorted punctuation).
 * The test passes as long as none of the constructions throws.
 */
public void testLegalBackslashChars() {
  final String acceptedChars = "dDsSWw0123456789[]*&^$@!{}\\/";
  for (char escaped : acceptedChars.toCharArray()) {
    // A single backslash followed by the character under test.
    final String expression = "\\" + escaped;
    new RegExp(expression);
  }
}
static String randomDocValue(int minLength) { static String randomDocValue(int minLength) {
String charPalette = "AAAaaaBbbCccc123456 \t"; String charPalette = "AAAaaaBbbCccc123456 \t";
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();