Painless: move semicolon hack into lexer

Previously we used token-level lookbehind in the parser. That worked,
but only if the parser didn't have any ambiguity *at all*. Since the
parser has ambiguity it didn't work everywhere. In particular it failed
when parsing blocks in lambdas like `a -> {int b = a + 2; b * b}`.

This moves the hack from the parser into the lexer. There we can use
token lookbehind (same trick) to *insert* semicolons into the token
stream. This works much better for antlr because antlr's prediction
code can work with real tokens.

Also, the lexer is simpler than the parser, so if there is a place
to introduce a hack, that is a better place.
This commit is contained in:
Nik Everett 2016-06-16 17:29:10 -04:00
parent d09d89f8c5
commit 1e16c22d03
10 changed files with 471 additions and 403 deletions

View File

@ -56,7 +56,7 @@ THIS: 'this';
BOOLNOT: '!';
BWNOT: '~';
MUL: '*';
DIV: '/' { false == SlashStrategy.slashIsRegex(_factory) }?;
DIV: '/' { false == SlashStrategy.slashIsRegex(this) }?;
REM: '%';
ADD: '+';
SUB: '-';
@ -104,7 +104,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fF]?;
STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\"] )*? '\'' );
REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { SlashStrategy.slashIsRegex(_factory) }?;
REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { SlashStrategy.slashIsRegex(this) }?;
TRUE: 'true';
FALSE: 'false';

View File

@ -92,17 +92,6 @@ trap
delimiter
: SEMICOLON
| EOF
// RBRACK is a delimiter but we don't consume it because it is only valid
// in places where RBRACK can follow the statement. It is simpler to not
// consume it here than it is to consume it here. Unfortunately, the
// obvious syntax to do this `| { _input.LA(1) == RBRACK }?` generates an
// amazingly intense `adaptivePredict` call that doesn't actually work
// and builds a serious DFA. Huge. So instead we use standard ANTLR syntax
// to consume the token and then undo the consumption. This looks hairy but
// it is better than the alternatives.
| { int mark = _input.mark(); int index = _input.index(); }
RBRACK
{ _input.seek(index); _input.release(mark); }
;
// Note we return the boolean s. This is returned as true

View File

@ -0,0 +1,96 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.painless.antlr;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.LexerNoViableAltException;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.TokenSource;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.Pair;
import org.elasticsearch.painless.Location;
/**
 * The lexer that painless actually runs. On top of the generated {@link PainlessLexer} it:
 * <ul>
 * <li>fails on the first lexing error instead of attempting to recover from it
 * <li>remembers the most recently emitted token so that lookbehind is possible (semicolon insertion and regex vs division detection)
 * <li>inserts a semicolon in front of a closing curly brace when the token before the brace did not already end the statement.
 *     Doing this here, with a cheap heuristic, avoids hacking it into the parser and creating a ton of ambiguity there.
 * </ul>
 */
final class EnhancedPainlessLexer extends PainlessLexer {
    /** Name of the script being lexed; used when building error locations. */
    final String sourceName;
    /** Real token held back while a synthetic semicolon is emitted ahead of it, or {@code null} when nothing is pending. */
    private Token pendingToken = null;
    /** Token most recently returned from {@link #nextToken()}, or {@code null} before the first call. */
    private Token lastEmitted = null;

    EnhancedPainlessLexer(CharStream charStream, String sourceName) {
        super(charStream);
        this.sourceName = sourceName;
    }

    /**
     * @return the token most recently returned by {@link #nextToken()}, or {@code null} if none has been returned yet
     */
    public Token getPreviousToken() {
        return lastEmitted;
    }

    /**
     * Returns the next token of the stream, emitting a synthetic semicolon first when one has to be inserted.
     * The token that triggered the insertion is held back and returned by the following call.
     */
    @Override
    public Token nextToken() {
        final Token emitted;
        if (pendingToken == null) {
            final Token lexed = super.nextToken();
            if (insertSemicolon(lastEmitted, lexed)) {
                // Hold the real token back and hand out the semicolon in its place.
                pendingToken = lexed;
                emitted = semicolonBefore(lexed);
            } else {
                emitted = lexed;
            }
        } else {
            // A semicolon was emitted by the previous call; now release the token that was held back.
            emitted = pendingToken;
            pendingToken = null;
        }
        lastEmitted = emitted;
        return emitted;
    }

    /**
     * Never recovers: converts the lexer failure into an error located at the start of the offending token.
     */
    @Override
    public void recover(final LexerNoViableAltException lnvae) {
        final CharStream input = lnvae.getInputStream();
        final Interval offending = Interval.of(lnvae.getStartIndex(), input.index());
        final String badText = input.getText(offending);
        final Location where = new Location(sourceName, _tokenStartCharIndex);
        throw where.createError(new IllegalArgumentException("unexpected character [" + getErrorDisplay(badText) + "].", lnvae));
    }

    /**
     * Builds the synthetic semicolon token. It copies the start/stop indices, line and column of {@code following}.
     */
    private Token semicolonBefore(Token following) {
        return _factory.create(new Pair<TokenSource, CharStream>(this, _input), PainlessLexer.SEMICOLON, ";",
                Lexer.DEFAULT_TOKEN_CHANNEL, following.getStartIndex(), following.getStopIndex(), following.getLine(),
                following.getCharPositionInLine());
    }

    /**
     * Decides whether a semicolon belongs between {@code before} and {@code upcoming}. Only a closing curly brace can
     * trigger an insertion, and only when the token in front of it is not itself a closing curly brace
     * ({@code };}} would be weird), a semicolon (already terminated) or an opening curly brace (empty block).
     */
    private static boolean insertSemicolon(Token before, Token upcoming) {
        if (before == null) {
            return false;
        }
        if (upcoming.getType() != PainlessLexer.RBRACK) {
            return false;
        }
        final int beforeType = before.getType();
        final boolean alreadyDelimited = beforeType == PainlessLexer.RBRACK
                || beforeType == PainlessLexer.SEMICOLON
                || beforeType == PainlessLexer.LBRACK;
        return alreadyDelimited == false;
    }
}

View File

@ -140,14 +140,14 @@ class PainlessLexer extends Lexer {
private boolean DIV_sempred(RuleContext _localctx, int predIndex) {
switch (predIndex) {
case 0:
return false == SlashStrategy.slashIsRegex(_factory) ;
return false == SlashStrategy.slashIsRegex(this) ;
}
return true;
}
private boolean REGEX_sempred(RuleContext _localctx, int predIndex) {
switch (predIndex) {
case 1:
return SlashStrategy.slashIsRegex(_factory) ;
return SlashStrategy.slashIsRegex(this) ;
}
return true;
}

View File

@ -20,15 +20,14 @@
package org.elasticsearch.painless.antlr;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.TokenFactory;
/**
* Utility to figure out if a {@code /} is division or the start of a regex literal.
*/
public class SlashStrategy {
public static boolean slashIsRegex(TokenFactory<?> factory) {
StashingTokenFactory<?> stashingFactory = (StashingTokenFactory<?>) factory;
Token lastToken = stashingFactory.getLastToken();
public static boolean slashIsRegex(PainlessLexer lexer) {
EnhancedPainlessLexer realLexer = (EnhancedPainlessLexer) lexer;
Token lastToken = realLexer.getPreviousToken();
if (lastToken == null) {
return true;
}

View File

@ -183,7 +183,7 @@ public final class Walker extends PainlessParserBaseVisitor<Object> {
private SourceContext buildAntlrTree(String source) {
ANTLRInputStream stream = new ANTLRInputStream(source);
PainlessLexer lexer = new ErrorHandlingLexer(stream, sourceName);
PainlessLexer lexer = new EnhancedPainlessLexer(stream, sourceName);
PainlessParser parser = new PainlessParser(new CommonTokenStream(lexer));
ParserErrorStrategy strategy = new ParserErrorStrategy(sourceName);

View File

@ -79,7 +79,11 @@ public class LambdaTests extends ScriptTestCase {
}
public void testMultipleStatements() {
assertEquals(2, exec("int applyOne(IntFunction arg) { arg.apply(1) } applyOne(x -> { x = x + 1; return x;})"));
assertEquals(2, exec("int applyOne(IntFunction arg) { arg.apply(1) } applyOne(x -> { x = x + 1; return x })"));
}
// A lambda whose curly-brace body is a single expression with no trailing semicolon and no
// explicit return: applying it to 1 is expected to yield 2.
public void testUnneededCurlyStatements() {
assertEquals(2, exec("int applyOne(IntFunction arg) { arg.apply(1) } applyOne(x -> { x + 1 })"));
}
public void testTwoLambdas() {

View File

@ -201,6 +201,6 @@ public class RegexTests extends ScriptTestCase {
IllegalArgumentException e = expectScriptThrows(IllegalArgumentException.class, () -> {
exec("/asdf/b", emptyMap(), emptyMap(), null); // Not picky so we get a non-assertion error
});
assertEquals("invalid sequence of tokens near ['b'].", e.getMessage());
assertEquals("unexpected token ['b'] was expecting one of [{<EOF>, ';'}].", e.getMessage());
}
}

View File

@ -40,7 +40,7 @@ public class ParserTests extends ScriptTestCase {
private SourceContext buildAntlrTree(String source) {
ANTLRInputStream stream = new ANTLRInputStream(source);
PainlessLexer lexer = new ErrorHandlingLexer(stream, "testing");
PainlessLexer lexer = new EnhancedPainlessLexer(stream, "testing");
PainlessParser parser = new PainlessParser(new CommonTokenStream(lexer));
ParserErrorStrategy strategy = new ParserErrorStrategy("testing");