From 257a7d77edf42b17e6c645bae99d5957f0b5732f Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Wed, 22 Mar 2017 15:56:17 -0400 Subject: [PATCH] Painless: Fix regex lexer and error messages (#23634) Without this change, if write a script with multiple regexes *sometimes* the lexer will decide to look at them like one big regex and then some trailing garbage. Like this discuss post: https://discuss.elastic.co/t/error-with-the-split-function-in-painless-script/79021 ``` def val = /\\\\/.split(ctx._source.event_data.param17); if (val[2] =~ /\\./) { def val2 = /\\./.split(val[2]); ctx._source['user_crash'] = val2[0] } else { ctx._source['user_crash'] = val[2] } ``` The error message you get from the lexer is `lexer_no_viable_alt_exception` right after the *second* regex. With this change each regex is just a single regex like it ought to be. As a bonus, while looking into this issue I found that the error reporting for regexes wasn't very nice. If you specify an invalid pattern then you get an error marker on the start of the pattern with the JVM's regex error message which attempts to point you to the location in the regex but is totally unreadable in the JSON response. This change fixes the location to point to the appropriate spot inside the pattern and removes the portion of the JVM's error message that doesn't render well. It is no longer needed now that we point users to the appropriate spot in the pattern. --- .../src/main/antlr/PainlessLexer.g4 | 2 +- .../painless/antlr/PainlessLexer.java | 54 +++++++++---------- .../elasticsearch/painless/node/ERegex.java | 5 +- .../elasticsearch/painless/RegexTests.java | 23 ++++++-- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/modules/lang-painless/src/main/antlr/PainlessLexer.g4 b/modules/lang-painless/src/main/antlr/PainlessLexer.g4 index f60d48efcc7..6ab6a861135 100644 --- a/modules/lang-painless/src/main/antlr/PainlessLexer.g4 +++ b/modules/lang-painless/src/main/antlr/PainlessLexer.g4 @@ -120,7 +120,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?; DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?; STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' ); -REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { slashIsRegex() }?; +REGEX: '/' ( '\\' ~'\n' | ~('/' | '\n') )+? '/' [cilmsUux]* { slashIsRegex() }?; TRUE: 'true'; FALSE: 'false'; diff --git a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java index 44972061b59..fd32c59b4ff 100644 --- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java +++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java @@ -1,7 +1,5 @@ // ANTLR GENERATED CODE: DO NOT EDIT package org.elasticsearch.painless.antlr; - - import org.antlr.v4.runtime.Lexer; import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.Token; @@ -211,29 +209,29 @@ abstract class PainlessLexer extends Lexer { "\3N\3N\7N\u021a\nN\fN\16N\u021d\13N\3N\3N\3O\3O\3O\3O\3O\3P\3P\3P\3P\3"+ "P\3P\3Q\3Q\3Q\3Q\3Q\3R\3R\3R\3R\7R\u0235\nR\fR\16R\u0238\13R\3R\3R\3S"+ "\3S\7S\u023e\nS\fS\16S\u0241\13S\3T\3T\3T\7T\u0246\nT\fT\16T\u0249\13"+ - "T\5T\u024b\nT\3T\3T\3U\3U\7U\u0251\nU\fU\16U\u0254\13U\3U\3U\6\u00b9\u00c3"+ - "\u01fd\u0209\2V\4\3\6\4\b\5\n\6\f\7\16\b\20\t\22\n\24\13\26\f\30\r\32"+ - "\16\34\17\36\20 \21\"\22$\23&\24(\25*\26,\27.\30\60\31\62\32\64\33\66"+ - "\348\35:\36<\37> @!B\"D#F$H%J&L\'N(P)R*T+V,X-Z.\\/^\60`\61b\62d\63f\64"+ - "h\65j\66l\67n8p9r:t;v|?~@\u0080A\u0082B\u0084C\u0086D\u0088E\u008a"+ - "F\u008cG\u008eH\u0090I\u0092J\u0094K\u0096L\u0098M\u009aN\u009cO\u009e"+ - "P\u00a0Q\u00a2R\u00a4S\u00a6T\u00a8U\u00aaV\4\2\3\25\5\2\13\f\17\17\""+ - "\"\4\2\f\f\17\17\3\2\629\4\2NNnn\4\2ZZzz\5\2\62;CHch\3\2\63;\3\2\62;\b"+ - "\2FFHHNNffhhnn\4\2GGgg\4\2--//\6\2FFHHffhh\4\2$$^^\4\2))^^\4\2\f\f\61"+ - "\61\3\2\f\f\t\2WWeekknouuwwzz\5\2C\\aac|\6\2\62;C\\aac|\u0277\2\4\3\2"+ - "\2\2\2\6\3\2\2\2\2\b\3\2\2\2\2\n\3\2\2\2\2\f\3\2\2\2\2\16\3\2\2\2\2\20"+ - "\3\2\2\2\2\22\3\2\2\2\2\24\3\2\2\2\2\26\3\2\2\2\2\30\3\2\2\2\2\32\3\2"+ - "\2\2\2\34\3\2\2\2\2\36\3\2\2\2\2 \3\2\2\2\2\"\3\2\2\2\2$\3\2\2\2\2&\3"+ - "\2\2\2\2(\3\2\2\2\2*\3\2\2\2\2,\3\2\2\2\2.\3\2\2\2\2\60\3\2\2\2\2\62\3"+ - "\2\2\2\2\64\3\2\2\2\2\66\3\2\2\2\28\3\2\2\2\2:\3\2\2\2\2<\3\2\2\2\2>\3"+ - "\2\2\2\2@\3\2\2\2\2B\3\2\2\2\2D\3\2\2\2\2F\3\2\2\2\2H\3\2\2\2\2J\3\2\2"+ - "\2\2L\3\2\2\2\2N\3\2\2\2\2P\3\2\2\2\2R\3\2\2\2\2T\3\2\2\2\2V\3\2\2\2\2"+ - "X\3\2\2\2\2Z\3\2\2\2\2\\\3\2\2\2\2^\3\2\2\2\2`\3\2\2\2\2b\3\2\2\2\2d\3"+ - "\2\2\2\2f\3\2\2\2\2h\3\2\2\2\2j\3\2\2\2\2l\3\2\2\2\2n\3\2\2\2\2p\3\2\2"+ - "\2\2r\3\2\2\2\2t\3\2\2\2\2v\3\2\2\2\2x\3\2\2\2\2z\3\2\2\2\2|\3\2\2\2\2"+ - "~\3\2\2\2\2\u0080\3\2\2\2\2\u0082\3\2\2\2\2\u0084\3\2\2\2\2\u0086\3\2"+ - "\2\2\2\u0088\3\2\2\2\2\u008a\3\2\2\2\2\u008c\3\2\2\2\2\u008e\3\2\2\2\2"+ - "\u0090\3\2\2\2\2\u0092\3\2\2\2\2\u0094\3\2\2\2\2\u0096\3\2\2\2\2\u0098"+ + "T\5T\u024b\nT\3T\3T\3U\3U\7U\u0251\nU\fU\16U\u0254\13U\3U\3U\7\u00b9\u00c3"+ + "\u01fd\u0209\u0215\2V\4\3\6\4\b\5\n\6\f\7\16\b\20\t\22\n\24\13\26\f\30"+ + "\r\32\16\34\17\36\20 \21\"\22$\23&\24(\25*\26,\27.\30\60\31\62\32\64\33"+ + "\66\348\35:\36<\37> @!B\"D#F$H%J&L\'N(P)R*T+V,X-Z.\\/^\60`\61b\62d\63"+ + "f\64h\65j\66l\67n8p9r:t;v|?~@\u0080A\u0082B\u0084C\u0086D\u0088E"+ + "\u008aF\u008cG\u008eH\u0090I\u0092J\u0094K\u0096L\u0098M\u009aN\u009c"+ + "O\u009eP\u00a0Q\u00a2R\u00a4S\u00a6T\u00a8U\u00aaV\4\2\3\25\5\2\13\f\17"+ + "\17\"\"\4\2\f\f\17\17\3\2\629\4\2NNnn\4\2ZZzz\5\2\62;CHch\3\2\63;\3\2"+ + "\62;\b\2FFHHNNffhhnn\4\2GGgg\4\2--//\6\2FFHHffhh\4\2$$^^\4\2))^^\3\2\f"+ + "\f\4\2\f\f\61\61\t\2WWeekknouuwwzz\5\2C\\aac|\6\2\62;C\\aac|\u0277\2\4"+ + "\3\2\2\2\2\6\3\2\2\2\2\b\3\2\2\2\2\n\3\2\2\2\2\f\3\2\2\2\2\16\3\2\2\2"+ + "\2\20\3\2\2\2\2\22\3\2\2\2\2\24\3\2\2\2\2\26\3\2\2\2\2\30\3\2\2\2\2\32"+ + "\3\2\2\2\2\34\3\2\2\2\2\36\3\2\2\2\2 \3\2\2\2\2\"\3\2\2\2\2$\3\2\2\2\2"+ + "&\3\2\2\2\2(\3\2\2\2\2*\3\2\2\2\2,\3\2\2\2\2.\3\2\2\2\2\60\3\2\2\2\2\62"+ + "\3\2\2\2\2\64\3\2\2\2\2\66\3\2\2\2\28\3\2\2\2\2:\3\2\2\2\2<\3\2\2\2\2"+ + ">\3\2\2\2\2@\3\2\2\2\2B\3\2\2\2\2D\3\2\2\2\2F\3\2\2\2\2H\3\2\2\2\2J\3"+ + "\2\2\2\2L\3\2\2\2\2N\3\2\2\2\2P\3\2\2\2\2R\3\2\2\2\2T\3\2\2\2\2V\3\2\2"+ + "\2\2X\3\2\2\2\2Z\3\2\2\2\2\\\3\2\2\2\2^\3\2\2\2\2`\3\2\2\2\2b\3\2\2\2"+ + "\2d\3\2\2\2\2f\3\2\2\2\2h\3\2\2\2\2j\3\2\2\2\2l\3\2\2\2\2n\3\2\2\2\2p"+ + "\3\2\2\2\2r\3\2\2\2\2t\3\2\2\2\2v\3\2\2\2\2x\3\2\2\2\2z\3\2\2\2\2|\3\2"+ + "\2\2\2~\3\2\2\2\2\u0080\3\2\2\2\2\u0082\3\2\2\2\2\u0084\3\2\2\2\2\u0086"+ + "\3\2\2\2\2\u0088\3\2\2\2\2\u008a\3\2\2\2\2\u008c\3\2\2\2\2\u008e\3\2\2"+ + "\2\2\u0090\3\2\2\2\2\u0092\3\2\2\2\2\u0094\3\2\2\2\2\u0096\3\2\2\2\2\u0098"+ "\3\2\2\2\2\u009a\3\2\2\2\2\u009c\3\2\2\2\2\u009e\3\2\2\2\2\u00a0\3\2\2"+ "\2\2\u00a2\3\2\2\2\2\u00a4\3\2\2\2\2\u00a6\3\2\2\2\3\u00a8\3\2\2\2\3\u00aa"+ "\3\2\2\2\4\u00ad\3\2\2\2\6\u00c8\3\2\2\2\b\u00cc\3\2\2\2\n\u00ce\3\2\2"+ @@ -358,9 +356,9 @@ abstract class PainlessLexer extends Lexer { "\3\2\2\2\u0207\u0206\3\2\2\2\u0208\u020b\3\2\2\2\u0209\u020a\3\2\2\2\u0209"+ "\u0207\3\2\2\2\u020a\u020c\3\2\2\2\u020b\u0209\3\2\2\2\u020c\u020e\7)"+ "\2\2\u020d\u01f5\3\2\2\2\u020d\u0201\3\2\2\2\u020e\u009b\3\2\2\2\u020f"+ - "\u0213\7\61\2\2\u0210\u0214\n\20\2\2\u0211\u0212\7^\2\2\u0212\u0214\n"+ - "\21\2\2\u0213\u0210\3\2\2\2\u0213\u0211\3\2\2\2\u0214\u0215\3\2\2\2\u0215"+ - "\u0213\3\2\2\2\u0215\u0216\3\2\2\2\u0216\u0217\3\2\2\2\u0217\u021b\7\61"+ + "\u0213\7\61\2\2\u0210\u0211\7^\2\2\u0211\u0214\n\20\2\2\u0212\u0214\n"+ + "\21\2\2\u0213\u0210\3\2\2\2\u0213\u0212\3\2\2\2\u0214\u0215\3\2\2\2\u0215"+ + "\u0216\3\2\2\2\u0215\u0213\3\2\2\2\u0216\u0217\3\2\2\2\u0217\u021b\7\61"+ "\2\2\u0218\u021a\t\22\2\2\u0219\u0218\3\2\2\2\u021a\u021d\3\2\2\2\u021b"+ "\u0219\3\2\2\2\u021b\u021c\3\2\2\2\u021c\u021e\3\2\2\2\u021d\u021b\3\2"+ "\2\2\u021e\u021f\6N\3\2\u021f\u009d\3\2\2\2\u0220\u0221\7v\2\2\u0221\u0222"+ diff --git a/modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java b/modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java index 1d1f41948c4..4b38868b1b1 100644 --- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java +++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java @@ -68,8 +68,9 @@ public final class ERegex extends AExpression { try { Pattern.compile(pattern, flags); - } catch (PatternSyntaxException exception) { - throw createError(exception); + } catch (PatternSyntaxException e) { + throw new Location(location.getSourceName(), location.getOffset() + 1 + e.getIndex()).createError( + new IllegalArgumentException("Error compiling regex: " + e.getDescription())); } constant = new Constant(location, Definition.PATTERN_TYPE.type, "regexAt$" + location.getOffset(), this::initializeConstant); diff --git a/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java b/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java index 1c53692ad74..83a592b3f26 100644 --- a/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java +++ b/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java @@ -20,6 +20,7 @@ package org.elasticsearch.painless; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.script.ScriptException; import java.nio.CharBuffer; import java.util.Arrays; @@ -44,8 +45,17 @@ public class RegexTests extends ScriptTestCase { assertEquals(false, exec("return 'bar' ==~ /foo/")); } - public void testSlashesEscapePattern() { - assertEquals(true, exec("return '//' ==~ /\\/\\//")); + public void testBackslashEscapesForwardSlash() { + assertEquals(true, exec("'//' ==~ /\\/\\//")); + } + + public void testBackslashEscapeBackslash() { + // Both of these are single backslashes but java escaping + Painless escaping.... + assertEquals(true, exec("'\\\\' ==~ /\\\\/")); + } + + public void testRegexIsNonGreedy() { + assertEquals(true, exec("def s = /\\\\/.split('.\\\\.'); return s[1] ==~ /\\./")); } public void testPatternAfterAssignment() { @@ -248,11 +258,14 @@ public class RegexTests extends ScriptTestCase { } public void testBadRegexPattern() { - PatternSyntaxException e = expectScriptThrows(PatternSyntaxException.class, () -> { + ScriptException e = expectThrows(ScriptException.class, () -> { exec("/\\ujjjj/"); // Invalid unicode }); - assertThat(e.getMessage(), containsString("Illegal Unicode escape sequence near index 2")); - assertThat(e.getMessage(), containsString("\\ujjjj")); + assertEquals("Error compiling regex: Illegal Unicode escape sequence", e.getCause().getMessage()); + + // And make sure the location of the error points to the offset inside the pattern + assertEquals("/\\ujjjj/", e.getScriptStack().get(0)); + assertEquals(" ^---- HERE", e.getScriptStack().get(1)); } public void testRegexAgainstNumber() {