Painless: move semicolon hack into lexer

Previously we used token-level lookbehind in the parser. That worked, but only if the parser didn't have any ambiguity *at all*. Since the parser does have ambiguity, it didn't work everywhere. In particular, it failed when parsing blocks in lambdas like `a -> {int b = a + 2; b * b}`. This moves the hack from the parser into the lexer. There we can use token lookbehind (the same trick) to *insert* semicolons into the token stream. This works much better for ANTLR because ANTLR's prediction code can work with real tokens. Also, the lexer is simpler than the parser, so if there is a place to introduce a hack, the lexer is the better place.
2016-06-16 17:29:10 -04:00 · 2016-06-16 17:29:10 -04:00 · 1e16c22d03
parent d09d89f8c5
commit 1e16c22d03
10 changed files with 471 additions and 403 deletions
--- a/modules/lang-painless/src/main/antlr/PainlessLexer.g4
+++ b/modules/lang-painless/src/main/antlr/PainlessLexer.g4
@ -56,7 +56,7 @@ THIS:      'this';
 BOOLNOT: '!';
 BWNOT:   '~';
 MUL:     '*';
-DIV:     '/' { false == SlashStrategy.slashIsRegex(_factory) }?;
+DIV:     '/' { false == SlashStrategy.slashIsRegex(this) }?;
 REM:     '%';
 ADD:     '+';
 SUB:     '-';
@ -104,7 +104,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
 DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fF]?;

 STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\"] )*? '\'' );
-REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { SlashStrategy.slashIsRegex(_factory) }?;
+REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { SlashStrategy.slashIsRegex(this) }?;

 TRUE:  'true';
 FALSE: 'false';
--- a/modules/lang-painless/src/main/antlr/PainlessParser.g4
+++ b/modules/lang-painless/src/main/antlr/PainlessParser.g4
@ -92,17 +92,6 @@ trap
 delimiter
    : SEMICOLON
    | EOF
-    // RBRACK is a delimiter but we don't consume it because it is only valid
-    // in places where RBRACK can follow the statement. It is simpler to not
-    // consume it here then it is to consume it here. Unfortunately, they
-    // obvious syntax to do this `| { _input.LA(1) == RBRACK }?` generates an
-    // amazingly intense `adaptivePredict` call that doesn't actually work
-    // and builds a serious DFA. Huge. So instead we use standard ANTLR syntax
-    // to consume the token and then undo the consumption. This looks hairy but
-    // it is better than the alternatives.
-    |   { int mark = _input.mark(); int index = _input.index(); }
-            RBRACK
-        { _input.seek(index); _input.release(mark); }
    ;

 // Note we return the boolean s.  This is returned as true
--- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/EnhancedPainlessLexer.java
+++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/EnhancedPainlessLexer.java
@ -0,0 +1,96 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.painless.antlr;
+
+import org.antlr.v4.runtime.CharStream;
+import org.antlr.v4.runtime.Lexer;
+import org.antlr.v4.runtime.LexerNoViableAltException;
+import org.antlr.v4.runtime.Token;
+import org.antlr.v4.runtime.TokenSource;
+import org.antlr.v4.runtime.misc.Interval;
+import org.antlr.v4.runtime.misc.Pair;
+import org.elasticsearch.painless.Location;
+
+/**
+ * A lexer that is customized for painless. It will:
+ * <ul>
+ * <li>will override the default error behavior to fail on the first error
+ * <li>store the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection
+ * <li>insert semicolons where they'd improve the language's readability. Rather than hack this into the parser and create a ton of
+ * ambiguity we hack them here where we can use heuristics to do it quickly.
+ * </ul>
+ */
+final class EnhancedPainlessLexer extends PainlessLexer {
+    final String sourceName;
+    private Token stashedNext = null;
+    private Token previous = null;
+
+    EnhancedPainlessLexer(CharStream charStream, String sourceName) {
+        super(charStream);
+        this.sourceName = sourceName;
+    }
+
+    public Token getPreviousToken() {
+        return previous;
+    }
+
+    @Override
+    public Token nextToken() {
+        if (stashedNext != null) {
+            previous = stashedNext;
+            stashedNext = null;
+            return previous;
+        }
+        Token next = super.nextToken();
+        if (insertSemicolon(previous, next)) {
+            stashedNext = next;
+            previous = _factory.create(new Pair<TokenSource, CharStream>(this, _input), PainlessLexer.SEMICOLON, ";",
+                    Lexer.DEFAULT_TOKEN_CHANNEL, next.getStartIndex(), next.getStopIndex(), next.getLine(), next.getCharPositionInLine());
+            return previous;
+        } else {
+            previous = next;
+            return next;
+        }
+    }
+
+    @Override
+    public void recover(final LexerNoViableAltException lnvae) {
+        final CharStream charStream = lnvae.getInputStream();
+        final int startIndex = lnvae.getStartIndex();
+        final String text = charStream.getText(Interval.of(startIndex, charStream.index()));
+
+        Location location = new Location(sourceName, _tokenStartCharIndex);
+        throw location.createError(new IllegalArgumentException("unexpected character [" + getErrorDisplay(text) + "].", lnvae));
+    }
+
+    private static boolean insertSemicolon(Token previous, Token next) {
+        if (previous == null || next.getType() != PainlessLexer.RBRACK) {
+            return false;
+        }
+        switch (previous.getType()) {
+        case PainlessLexer.RBRACK:     // };} would be weird!
+        case PainlessLexer.SEMICOLON:  // already have a semicolon, no need to add one
+        case PainlessLexer.LBRACK:     // empty blocks don't need a semicolon
+            return false;
+        default:
+            return true;
+        }
+    }
+}
--- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java
+++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java
@ -140,14 +140,14 @@ class PainlessLexer extends Lexer {
  private boolean DIV_sempred(RuleContext _localctx, int predIndex) {
    switch (predIndex) {
    case 0:
-      return  false == SlashStrategy.slashIsRegex(_factory) ;
+      return  false == SlashStrategy.slashIsRegex(this) ;
    }
    return true;
  }
  private boolean REGEX_sempred(RuleContext _localctx, int predIndex) {
    switch (predIndex) {
    case 1:
-      return  SlashStrategy.slashIsRegex(_factory) ;
+      return  SlashStrategy.slashIsRegex(this) ;
    }
    return true;
  }
--- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessParser.java
+++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessParser.java
--- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/SlashStrategy.java
+++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/SlashStrategy.java
@ -20,15 +20,14 @@
 package org.elasticsearch.painless.antlr;

 import org.antlr.v4.runtime.Token;
-import org.antlr.v4.runtime.TokenFactory;

 /**
 * Utility to figure out if a {@code /} is division or the start of a regex literal.
 */
 public class SlashStrategy {
-    public static boolean slashIsRegex(TokenFactory<?> factory) {
-        StashingTokenFactory<?> stashingFactory = (StashingTokenFactory<?>) factory;
-        Token lastToken = stashingFactory.getLastToken();
+    public static boolean slashIsRegex(PainlessLexer lexer) {
+        EnhancedPainlessLexer realLexer = (EnhancedPainlessLexer) lexer;
+        Token lastToken = realLexer.getPreviousToken();
        if (lastToken == null) {
            return true;
        }
--- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/Walker.java
+++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/Walker.java
@ -183,7 +183,7 @@ public final class Walker extends PainlessParserBaseVisitor<Object> {

    private SourceContext buildAntlrTree(String source) {
        ANTLRInputStream stream = new ANTLRInputStream(source);
-        PainlessLexer lexer = new ErrorHandlingLexer(stream, sourceName);
+        PainlessLexer lexer = new EnhancedPainlessLexer(stream, sourceName);
        PainlessParser parser = new PainlessParser(new CommonTokenStream(lexer));
        ParserErrorStrategy strategy = new ParserErrorStrategy(sourceName);

--- a/modules/lang-painless/src/test/java/org/elasticsearch/painless/LambdaTests.java
+++ b/modules/lang-painless/src/test/java/org/elasticsearch/painless/LambdaTests.java
@ -79,7 +79,11 @@ public class LambdaTests extends ScriptTestCase {
    }

    public void testMultipleStatements() {
-        assertEquals(2, exec("int applyOne(IntFunction arg) { arg.apply(1) } applyOne(x -> { x = x + 1; return x;})"));
+        assertEquals(2, exec("int applyOne(IntFunction arg) { arg.apply(1) } applyOne(x -> { x = x + 1; return x })"));
+    }
+
+    public void testUnneededCurlyStatements() {
+        assertEquals(2, exec("int applyOne(IntFunction arg) { arg.apply(1) } applyOne(x -> { x + 1 })"));
    }

    public void testTwoLambdas() {
--- a/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java
+++ b/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java
@ -201,6 +201,6 @@ public class RegexTests extends ScriptTestCase {
        IllegalArgumentException e = expectScriptThrows(IllegalArgumentException.class, () -> {
            exec("/asdf/b", emptyMap(), emptyMap(), null); // Not picky so we get a non-assertion error
        });
-        assertEquals("invalid sequence of tokens near ['b'].", e.getMessage());
+        assertEquals("unexpected token ['b'] was expecting one of [{<EOF>, ';'}].", e.getMessage());
    }
 }
--- a/modules/lang-painless/src/test/java/org/elasticsearch/painless/antlr/ParserTests.java
+++ b/modules/lang-painless/src/test/java/org/elasticsearch/painless/antlr/ParserTests.java
@ -40,7 +40,7 @@ public class ParserTests extends ScriptTestCase {

    private SourceContext buildAntlrTree(String source) {
        ANTLRInputStream stream = new ANTLRInputStream(source);
-        PainlessLexer lexer = new ErrorHandlingLexer(stream, "testing");
+        PainlessLexer lexer = new EnhancedPainlessLexer(stream, "testing");
        PainlessParser parser = new PainlessParser(new CommonTokenStream(lexer));
        ParserErrorStrategy strategy = new ParserErrorStrategy("testing");