Painless: Fix regex lexer and error messages (#23634)
Without this change, if you write a script with multiple regexes, *sometimes* the lexer will decide to treat them as one big regex followed by some trailing garbage, as in this Discuss post: https://discuss.elastic.co/t/error-with-the-split-function-in-painless-script/79021

```
def val = /\\\\/.split(ctx._source.event_data.param17);
if (val[2] =~ /\\./) {
  def val2 = /\\./.split(val[2]);
  ctx._source['user_crash'] = val2[0]
} else {
  ctx._source['user_crash'] = val[2]
}
```

The error message you get from the lexer is `lexer_no_viable_alt_exception` right after the *second* regex. With this change each regex is lexed as a single regex, like it ought to be.

As a bonus, while looking into this issue I found that the error reporting for regexes wasn't very nice. If you specify an invalid pattern you get an error marker at the start of the pattern together with the JVM's regex error message, which tries to point you to the offending location inside the regex but is totally unreadable in the JSON response. This change fixes the error location to point to the appropriate spot inside the pattern and removes the portion of the JVM's error message that doesn't render well; it is no longer needed now that we point users to the right spot in the pattern.
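To make the lexing problem concrete, here is a rough, self-contained analogy using java.util.regex rather than the ANTLR lexer (not part of the commit; the class name, sample script, and patterns are made up for the demo). The old REGEX rule's greedy `( ... )+` loop behaves roughly like a greedy quantifier and runs to the furthest `/` it can still close on, while the fixed rule's reluctant `( ... )+?` loop stops at the first viable closing `/`:

```
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GreedyRegexLexing {
    public static void main(String[] args) {
        // A simplified script with two regex literals, as raw text seen by the lexer.
        String script = "def a = /\\\\/.split(x); if (a[2] =~ /\\./) { return a[2] }";

        // Greedy loop, like the old REGEX rule: runs to the furthest '/' it can still close on.
        Matcher greedy = Pattern.compile("/(?:\\\\[^\n]|[^/\n])+/").matcher(script);
        if (greedy.find()) {
            // Swallows everything from the first '/' through the start of the second regex literal.
            System.out.println("greedy:    " + greedy.group());
        }

        // Reluctant loop, like the new REGEX rule: stops at the first viable closing '/'.
        Matcher reluctant = Pattern.compile("/(?:\\\\[^\n]|[^/\n])+?/").matcher(script);
        if (reluctant.find()) {
            // Matches just the first regex literal, /\\/ .
            System.out.println("reluctant: " + reluctant.group());
        }
    }
}
```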
parent 490d29f4fc
commit 257a7d77ed
@@ -120,7 +120,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
 DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?;
 
 STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' );
-REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { slashIsRegex() }?;
+REGEX: '/' ( '\\' ~'\n' | ~('/' | '\n') )+? '/' [cilmsUux]* { slashIsRegex() }?;
 
 TRUE: 'true';
 FALSE: 'false';
@@ -1,7 +1,5 @@
// ANTLR GENERATED CODE: DO NOT EDIT
package org.elasticsearch.painless.antlr;

import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.Token;
@@ -211,29 +209,29 @@ abstract class PainlessLexer extends Lexer {
"\3N\3N\7N\u021a\nN\fN\16N\u021d\13N\3N\3N\3O\3O\3O\3O\3O\3P\3P\3P\3P\3"+
"P\3P\3Q\3Q\3Q\3Q\3Q\3R\3R\3R\3R\7R\u0235\nR\fR\16R\u0238\13R\3R\3R\3S"+
"\3S\7S\u023e\nS\fS\16S\u0241\13S\3T\3T\3T\7T\u0246\nT\fT\16T\u0249\13"+
"T\5T\u024b\nT\3T\3T\3U\3U\7U\u0251\nU\fU\16U\u0254\13U\3U\3U\6\u00b9\u00c3"+
"\u01fd\u0209\2V\4\3\6\4\b\5\n\6\f\7\16\b\20\t\22\n\24\13\26\f\30\r\32"+
"\16\34\17\36\20 \21\"\22$\23&\24(\25*\26,\27.\30\60\31\62\32\64\33\66"+
"\348\35:\36<\37> @!B\"D#F$H%J&L\'N(P)R*T+V,X-Z.\\/^\60`\61b\62d\63f\64"+
"h\65j\66l\67n8p9r:t;v<x=z>|?~@\u0080A\u0082B\u0084C\u0086D\u0088E\u008a"+
"F\u008cG\u008eH\u0090I\u0092J\u0094K\u0096L\u0098M\u009aN\u009cO\u009e"+
"P\u00a0Q\u00a2R\u00a4S\u00a6T\u00a8U\u00aaV\4\2\3\25\5\2\13\f\17\17\""+
"\"\4\2\f\f\17\17\3\2\629\4\2NNnn\4\2ZZzz\5\2\62;CHch\3\2\63;\3\2\62;\b"+
"\2FFHHNNffhhnn\4\2GGgg\4\2--//\6\2FFHHffhh\4\2$$^^\4\2))^^\4\2\f\f\61"+
"\61\3\2\f\f\t\2WWeekknouuwwzz\5\2C\\aac|\6\2\62;C\\aac|\u0277\2\4\3\2"+
"\2\2\2\6\3\2\2\2\2\b\3\2\2\2\2\n\3\2\2\2\2\f\3\2\2\2\2\16\3\2\2\2\2\20"+
"\3\2\2\2\2\22\3\2\2\2\2\24\3\2\2\2\2\26\3\2\2\2\2\30\3\2\2\2\2\32\3\2"+
"\2\2\2\34\3\2\2\2\2\36\3\2\2\2\2 \3\2\2\2\2\"\3\2\2\2\2$\3\2\2\2\2&\3"+
"\2\2\2\2(\3\2\2\2\2*\3\2\2\2\2,\3\2\2\2\2.\3\2\2\2\2\60\3\2\2\2\2\62\3"+
"\2\2\2\2\64\3\2\2\2\2\66\3\2\2\2\28\3\2\2\2\2:\3\2\2\2\2<\3\2\2\2\2>\3"+
"\2\2\2\2@\3\2\2\2\2B\3\2\2\2\2D\3\2\2\2\2F\3\2\2\2\2H\3\2\2\2\2J\3\2\2"+
"\2\2L\3\2\2\2\2N\3\2\2\2\2P\3\2\2\2\2R\3\2\2\2\2T\3\2\2\2\2V\3\2\2\2\2"+
"X\3\2\2\2\2Z\3\2\2\2\2\\\3\2\2\2\2^\3\2\2\2\2`\3\2\2\2\2b\3\2\2\2\2d\3"+
"\2\2\2\2f\3\2\2\2\2h\3\2\2\2\2j\3\2\2\2\2l\3\2\2\2\2n\3\2\2\2\2p\3\2\2"+
"\2\2r\3\2\2\2\2t\3\2\2\2\2v\3\2\2\2\2x\3\2\2\2\2z\3\2\2\2\2|\3\2\2\2\2"+
"~\3\2\2\2\2\u0080\3\2\2\2\2\u0082\3\2\2\2\2\u0084\3\2\2\2\2\u0086\3\2"+
"\2\2\2\u0088\3\2\2\2\2\u008a\3\2\2\2\2\u008c\3\2\2\2\2\u008e\3\2\2\2\2"+
"\u0090\3\2\2\2\2\u0092\3\2\2\2\2\u0094\3\2\2\2\2\u0096\3\2\2\2\2\u0098"+
"T\5T\u024b\nT\3T\3T\3U\3U\7U\u0251\nU\fU\16U\u0254\13U\3U\3U\7\u00b9\u00c3"+
"\u01fd\u0209\u0215\2V\4\3\6\4\b\5\n\6\f\7\16\b\20\t\22\n\24\13\26\f\30"+
"\r\32\16\34\17\36\20 \21\"\22$\23&\24(\25*\26,\27.\30\60\31\62\32\64\33"+
"\66\348\35:\36<\37> @!B\"D#F$H%J&L\'N(P)R*T+V,X-Z.\\/^\60`\61b\62d\63"+
"f\64h\65j\66l\67n8p9r:t;v<x=z>|?~@\u0080A\u0082B\u0084C\u0086D\u0088E"+
"\u008aF\u008cG\u008eH\u0090I\u0092J\u0094K\u0096L\u0098M\u009aN\u009c"+
"O\u009eP\u00a0Q\u00a2R\u00a4S\u00a6T\u00a8U\u00aaV\4\2\3\25\5\2\13\f\17"+
"\17\"\"\4\2\f\f\17\17\3\2\629\4\2NNnn\4\2ZZzz\5\2\62;CHch\3\2\63;\3\2"+
"\62;\b\2FFHHNNffhhnn\4\2GGgg\4\2--//\6\2FFHHffhh\4\2$$^^\4\2))^^\3\2\f"+
"\f\4\2\f\f\61\61\t\2WWeekknouuwwzz\5\2C\\aac|\6\2\62;C\\aac|\u0277\2\4"+
"\3\2\2\2\2\6\3\2\2\2\2\b\3\2\2\2\2\n\3\2\2\2\2\f\3\2\2\2\2\16\3\2\2\2"+
"\2\20\3\2\2\2\2\22\3\2\2\2\2\24\3\2\2\2\2\26\3\2\2\2\2\30\3\2\2\2\2\32"+
"\3\2\2\2\2\34\3\2\2\2\2\36\3\2\2\2\2 \3\2\2\2\2\"\3\2\2\2\2$\3\2\2\2\2"+
"&\3\2\2\2\2(\3\2\2\2\2*\3\2\2\2\2,\3\2\2\2\2.\3\2\2\2\2\60\3\2\2\2\2\62"+
"\3\2\2\2\2\64\3\2\2\2\2\66\3\2\2\2\28\3\2\2\2\2:\3\2\2\2\2<\3\2\2\2\2"+
">\3\2\2\2\2@\3\2\2\2\2B\3\2\2\2\2D\3\2\2\2\2F\3\2\2\2\2H\3\2\2\2\2J\3"+
"\2\2\2\2L\3\2\2\2\2N\3\2\2\2\2P\3\2\2\2\2R\3\2\2\2\2T\3\2\2\2\2V\3\2\2"+
"\2\2X\3\2\2\2\2Z\3\2\2\2\2\\\3\2\2\2\2^\3\2\2\2\2`\3\2\2\2\2b\3\2\2\2"+
"\2d\3\2\2\2\2f\3\2\2\2\2h\3\2\2\2\2j\3\2\2\2\2l\3\2\2\2\2n\3\2\2\2\2p"+
"\3\2\2\2\2r\3\2\2\2\2t\3\2\2\2\2v\3\2\2\2\2x\3\2\2\2\2z\3\2\2\2\2|\3\2"+
"\2\2\2~\3\2\2\2\2\u0080\3\2\2\2\2\u0082\3\2\2\2\2\u0084\3\2\2\2\2\u0086"+
"\3\2\2\2\2\u0088\3\2\2\2\2\u008a\3\2\2\2\2\u008c\3\2\2\2\2\u008e\3\2\2"+
"\2\2\u0090\3\2\2\2\2\u0092\3\2\2\2\2\u0094\3\2\2\2\2\u0096\3\2\2\2\2\u0098"+
"\3\2\2\2\2\u009a\3\2\2\2\2\u009c\3\2\2\2\2\u009e\3\2\2\2\2\u00a0\3\2\2"+
"\2\2\u00a2\3\2\2\2\2\u00a4\3\2\2\2\2\u00a6\3\2\2\2\3\u00a8\3\2\2\2\3\u00aa"+
"\3\2\2\2\4\u00ad\3\2\2\2\6\u00c8\3\2\2\2\b\u00cc\3\2\2\2\n\u00ce\3\2\2"+
@@ -358,9 +356,9 @@ abstract class PainlessLexer extends Lexer {
"\3\2\2\2\u0207\u0206\3\2\2\2\u0208\u020b\3\2\2\2\u0209\u020a\3\2\2\2\u0209"+
"\u0207\3\2\2\2\u020a\u020c\3\2\2\2\u020b\u0209\3\2\2\2\u020c\u020e\7)"+
"\2\2\u020d\u01f5\3\2\2\2\u020d\u0201\3\2\2\2\u020e\u009b\3\2\2\2\u020f"+
"\u0213\7\61\2\2\u0210\u0214\n\20\2\2\u0211\u0212\7^\2\2\u0212\u0214\n"+
"\21\2\2\u0213\u0210\3\2\2\2\u0213\u0211\3\2\2\2\u0214\u0215\3\2\2\2\u0215"+
"\u0213\3\2\2\2\u0215\u0216\3\2\2\2\u0216\u0217\3\2\2\2\u0217\u021b\7\61"+
"\u0213\7\61\2\2\u0210\u0211\7^\2\2\u0211\u0214\n\20\2\2\u0212\u0214\n"+
"\21\2\2\u0213\u0210\3\2\2\2\u0213\u0212\3\2\2\2\u0214\u0215\3\2\2\2\u0215"+
"\u0216\3\2\2\2\u0215\u0213\3\2\2\2\u0216\u0217\3\2\2\2\u0217\u021b\7\61"+
"\2\2\u0218\u021a\t\22\2\2\u0219\u0218\3\2\2\2\u021a\u021d\3\2\2\2\u021b"+
"\u0219\3\2\2\2\u021b\u021c\3\2\2\2\u021c\u021e\3\2\2\2\u021d\u021b\3\2"+
"\2\2\u021e\u021f\6N\3\2\u021f\u009d\3\2\2\2\u0220\u0221\7v\2\2\u0221\u0222"+
@@ -68,8 +68,9 @@ public final class ERegex extends AExpression {
 
         try {
             Pattern.compile(pattern, flags);
-        } catch (PatternSyntaxException exception) {
-            throw createError(exception);
+        } catch (PatternSyntaxException e) {
+            throw new Location(location.getSourceName(), location.getOffset() + 1 + e.getIndex()).createError(
+                    new IllegalArgumentException("Error compiling regex: " + e.getDescription()));
         }
 
         constant = new Constant(location, Definition.PATTERN_TYPE.type, "regexAt$" + location.getOffset(), this::initializeConstant);
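As a side note on the new error location, here is a self-contained sketch (not from the commit; the class name, script, and offset are illustrative) of the arithmetic above: `PatternSyntaxException.getIndex()` is relative to the pattern text between the slashes, so adding the regex token's offset plus one for the opening `/` yields the spot inside the script that the error marker should point at.

```
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

public class RegexErrorOffset {
    public static void main(String[] args) {
        String script = "/\\ujjjj/"; // a script that is nothing but one invalid regex literal
        int tokenOffset = 0;         // offset of the opening '/' within the script

        try {
            // Compile only the text between the slashes, roughly what ERegex hands to Pattern.compile.
            Pattern.compile(script.substring(1, script.length() - 1));
        } catch (PatternSyntaxException e) {
            // getIndex() points into the pattern, so add 1 for the opening '/'
            // to translate it back into an offset within the script source.
            int offsetInScript = tokenOffset + 1 + e.getIndex();
            System.out.println(e.getDescription() + " at script offset " + offsetInScript);
            // Expected: "Illegal Unicode escape sequence at script offset 3",
            // i.e. the first 'j' of \ujjjj, which is where the ^---- HERE marker lands.
        }
    }
}
```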
@@ -20,6 +20,7 @@
 package org.elasticsearch.painless;
 
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.script.ScriptException;
 
 import java.nio.CharBuffer;
 import java.util.Arrays;
@@ -44,8 +45,17 @@ public class RegexTests extends ScriptTestCase {
         assertEquals(false, exec("return 'bar' ==~ /foo/"));
     }
 
-    public void testSlashesEscapePattern() {
-        assertEquals(true, exec("return '//' ==~ /\\/\\//"));
+    public void testBackslashEscapesForwardSlash() {
+        assertEquals(true, exec("'//' ==~ /\\/\\//"));
+    }
+
+    public void testBackslashEscapeBackslash() {
+        // Both of these are single backslashes but java escaping + Painless escaping....
+        assertEquals(true, exec("'\\\\' ==~ /\\\\/"));
+    }
+
+    public void testRegexIsNonGreedy() {
+        assertEquals(true, exec("def s = /\\\\/.split('.\\\\.'); return s[1] ==~ /\\./"));
     }
 
     public void testPatternAfterAssignment() {
@@ -248,11 +258,14 @@ public class RegexTests extends ScriptTestCase {
     }
 
     public void testBadRegexPattern() {
-        PatternSyntaxException e = expectScriptThrows(PatternSyntaxException.class, () -> {
+        ScriptException e = expectThrows(ScriptException.class, () -> {
             exec("/\\ujjjj/"); // Invalid unicode
         });
-        assertThat(e.getMessage(), containsString("Illegal Unicode escape sequence near index 2"));
-        assertThat(e.getMessage(), containsString("\\ujjjj"));
+        assertEquals("Error compiling regex: Illegal Unicode escape sequence", e.getCause().getMessage());
+
+        // And make sure the location of the error points to the offset inside the pattern
+        assertEquals("/\\ujjjj/", e.getScriptStack().get(0));
+        assertEquals(" ^---- HERE", e.getScriptStack().get(1));
     }
 
     public void testRegexAgainstNumber() {