mirror of https://github.com/apache/lucene.git
LUCENE-9370: Remove any leniency around the use of backslashes in expressions, as per the Java Pattern policy. (#1516)
Also fixes a bug introduced in LUCENE-9336 where searches for \\ would crash.
This commit is contained in:
parent
4e564079fb
commit
819e668ce2
|
@ -60,6 +60,9 @@ API Changes
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
|
* LUCENE-9370: RegExp query is no longer lenient about inappropriate backslashes and
|
||||||
|
follows the Java Pattern policy for rejecting illegal syntax. (Mark Harwood)
|
||||||
|
|
||||||
* LUCENE-9336: RegExp query now supports \w \W \d \D \s \S expressions.
|
* LUCENE-9336: RegExp query now supports \w \W \d \D \s \S expressions.
|
||||||
This is a break with previous behaviour where these were (mis)interpreted
|
This is a break with previous behaviour where these were (mis)interpreted
|
||||||
as literally the characters w W d etc. (Mark Harwood)
|
as literally the characters w W d etc. (Mark Harwood)
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
# Apache Lucene Migration Guide
|
# Apache Lucene Migration Guide
|
||||||
|
|
||||||
|
## RegExpQuery now rejects invalid backslashes (LUCENE-9370)
|
||||||
|
|
||||||
|
We now follow the [Java rules](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs) for accepting backslashes.
|
||||||
|
Any alphabetic character other than s, S, w, W, d or D that is preceded by a backslash is now considered illegal syntax and causes an `IllegalArgumentException` to be thrown.
|
||||||
|
|
||||||
## RegExp certain regular expressions now match differently (LUCENE-9336)
|
## RegExp certain regular expressions now match differently (LUCENE-9336)
|
||||||
|
|
||||||
The commonly used regular expressions \w \W \d \D \s and \S now work the same way [Java Pattern](https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html#CHART) matching works. Previously these expressions were (mis)interpreted as searches for the literal characters w, d, s etc.
|
The commonly used regular expressions \w \W \d \D \s and \S now work the same way [Java Pattern](https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html#CHART) matching works. Previously these expressions were (mis)interpreted as searches for the literal characters w, d, s etc.
|
||||||
|
|
|
@ -1206,7 +1206,19 @@ public class RegExp {
|
||||||
re.from = next();
|
re.from = next();
|
||||||
return re;
|
return re;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
if (peek("\\")) {
|
||||||
|
return makeChar(next());
|
||||||
|
}
|
||||||
|
|
||||||
|
// From https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#bs
|
||||||
|
// "It is an error to use a backslash prior to any alphabetic character that does not denote an escaped
|
||||||
|
// construct;"
|
||||||
|
if (peek("abcefghijklmnopqrtuvxyz") || peek("ABCEFGHIJKLMNOPQRTUVXYZ")) {
|
||||||
|
throw new IllegalArgumentException("invalid character class \\" + next());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -50,7 +50,7 @@ public class TestRegexpQuery extends LuceneTestCase {
|
||||||
directory = newDirectory();
|
directory = newDirectory();
|
||||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3", Field.Store.NO));
|
doc.add(newTextField(FN, "the quick brown fox jumps over the lazy ??? dog 493432 49344 [foo] 12.3 \\", Field.Store.NO));
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
reader = writer.getReader();
|
reader = writer.getReader();
|
||||||
writer.close();
|
writer.close();
|
||||||
|
@ -113,7 +113,16 @@ public class TestRegexpQuery extends LuceneTestCase {
|
||||||
assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
|
assertEquals(1, regexQueryNrHits("\\S*ck")); //matches quick
|
||||||
assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
|
assertEquals(1, regexQueryNrHits("[\\d\\.]{3,10}")); // matches 12.3
|
||||||
assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
|
assertEquals(1, regexQueryNrHits("\\d{1,3}(\\.(\\d{1,2}))+")); // matches 12.3
|
||||||
|
|
||||||
|
assertEquals(1, regexQueryNrHits("\\\\"));
|
||||||
|
assertEquals(1, regexQueryNrHits("\\\\.*"));
|
||||||
|
|
||||||
|
IllegalArgumentException expected = expectThrows(
|
||||||
|
IllegalArgumentException.class, () -> {
|
||||||
|
regexQueryNrHits("\\p");
|
||||||
|
}
|
||||||
|
);
|
||||||
|
assertTrue(expected.getMessage().contains("invalid character class"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testRegexComplement() throws IOException {
|
public void testRegexComplement() throws IOException {
|
||||||
|
|
|
@ -96,6 +96,27 @@ public class TestRegExp extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testIllegalBackslashChars() {
|
||||||
|
String illegalChars = "abcefghijklmnopqrtuvxyzABCEFGHIJKLMNOPQRTUVXYZ";
|
||||||
|
for (int i = 0; i < illegalChars.length(); i++) {
|
||||||
|
String illegalExpression = "\\" + illegalChars.charAt(i);
|
||||||
|
IllegalArgumentException expected = expectThrows(
|
||||||
|
IllegalArgumentException.class, () -> {
|
||||||
|
new RegExp(illegalExpression);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
assertTrue(expected.getMessage().contains("invalid character class"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testLegalBackslashChars() {
|
||||||
|
String legalChars = "dDsSWw0123456789[]*&^$@!{}\\/";
|
||||||
|
for (int i = 0; i < legalChars.length(); i++) {
|
||||||
|
String legalExpression = "\\" + legalChars.charAt(i);
|
||||||
|
new RegExp(legalExpression);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static String randomDocValue(int minLength) {
|
static String randomDocValue(int minLength) {
|
||||||
String charPalette = "AAAaaaBbbCccc123456 \t";
|
String charPalette = "AAAaaaBbbCccc123456 \t";
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
Loading…
Reference in New Issue