Move lexer hacks to EnhancedPainlessLexer

This "feels" nicer. Fewer classes, at least.
This commit is contained in:
Nik Everett 2017-01-19 11:23:16 -05:00
parent e2da6a8ee5
commit dbb4a2ca6c
4 changed files with 54 additions and 59 deletions

View File

@ -20,7 +20,15 @@
lexer grammar PainlessLexer; lexer grammar PainlessLexer;
@header { @header {
import org.elasticsearch.painless.Definition; }
@members{
protected boolean isSimpleType(String name) {
throw new UnsupportedOperationException("Must be implemented in a subclass");
}
protected boolean slashIsRegex() {
throw new UnsupportedOperationException("Must be implemented in a subclass");
}
} }
WS: [ \t\n\r]+ -> skip; WS: [ \t\n\r]+ -> skip;
@ -59,7 +67,7 @@ INSTANCEOF: 'instanceof';
BOOLNOT: '!'; BOOLNOT: '!';
BWNOT: '~'; BWNOT: '~';
MUL: '*'; MUL: '*';
DIV: '/' { false == SlashStrategy.slashIsRegex(this) }?; DIV: '/' { false == slashIsRegex() }?;
REM: '%'; REM: '%';
ADD: '+'; ADD: '+';
SUB: '-'; SUB: '-';
@ -108,7 +116,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?; DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?;
STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' ); STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' );
REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { SlashStrategy.slashIsRegex(this) }?; REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { slashIsRegex() }?;
TRUE: 'true'; TRUE: 'true';
FALSE: 'false'; FALSE: 'false';
@ -121,7 +129,7 @@ NULL: 'null';
// or not. Note this works by processing one character at a time // or not. Note this works by processing one character at a time
// and the rule is added or removed as this happens. This is also known // and the rule is added or removed as this happens. This is also known
// as "the lexer hack." See (https://en.wikipedia.org/wiki/The_lexer_hack). // as "the lexer hack." See (https://en.wikipedia.org/wiki/The_lexer_hack).
TYPE: ID ( DOT ID )* { Definition.isSimpleType(getText()) }?; TYPE: ID ( DOT ID )* { isSimpleType(getText()) }?;
ID: [_a-zA-Z] [_a-zA-Z0-9]*; ID: [_a-zA-Z] [_a-zA-Z0-9]*;
mode AFTER_DOT; mode AFTER_DOT;

View File

@ -26,13 +26,15 @@ import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.TokenSource; import org.antlr.v4.runtime.TokenSource;
import org.antlr.v4.runtime.misc.Interval; import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.Pair; import org.antlr.v4.runtime.misc.Pair;
import org.elasticsearch.painless.Definition;
import org.elasticsearch.painless.Location; import org.elasticsearch.painless.Location;
/** /**
* A lexer that is customized for painless. It: * A lexer that is customized for painless. It:
* <ul> * <ul>
* <li>Overrides the default error behavior to fail on the first error * <li>Overrides the default error behavior to fail on the first error.
* <li>Stores the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection * <li>Stores the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection.
* <li>Implements the regex vs division detection.
* <li>Insert semicolons where they'd improve the language's readability. Rather than hack this into the parser and create a ton of * <li>Insert semicolons where they'd improve the language's readability. Rather than hack this into the parser and create a ton of
* ambiguity we hack them here where we can use heuristics to do it quickly. * ambiguity we hack them here where we can use heuristics to do it quickly.
* <li>Enhances the error message when a string contains invalid escape sequences to include a list of valid escape sequences. * <li>Enhances the error message when a string contains invalid escape sequences to include a list of valid escape sequences.
@ -89,6 +91,33 @@ final class EnhancedPainlessLexer extends PainlessLexer {
throw location.createError(new IllegalArgumentException(message, lnvae)); throw location.createError(new IllegalArgumentException(message, lnvae));
} }
@Override
// Wires the grammar's TYPE-vs-ID semantic predicate ("the lexer hack")
// to the painless Definition whitelist of known simple type names.
protected boolean isSimpleType(String name) {
return Definition.isSimpleType(name);
}
@Override
protected boolean slashIsRegex() {
    Token previous = getPreviousToken();
    if (previous == null) {
        // Nothing precedes the slash, so it can only open a regex literal.
        return true;
    }
    // A slash immediately after a token that can end an expression is
    // division; after anything else it starts a regex literal.
    int type = previous.getType();
    return type != PainlessLexer.RBRACE
        && type != PainlessLexer.RP
        && type != PainlessLexer.OCTAL
        && type != PainlessLexer.HEX
        && type != PainlessLexer.INTEGER
        && type != PainlessLexer.DECIMAL
        && type != PainlessLexer.ID
        && type != PainlessLexer.DOTINTEGER
        && type != PainlessLexer.DOTID;
}
private static boolean insertSemicolon(Token previous, Token next) { private static boolean insertSemicolon(Token previous, Token next) {
if (previous == null || next.getType() != PainlessLexer.RBRACK) { if (previous == null || next.getType() != PainlessLexer.RBRACK) {
return false; return false;

View File

@ -1,7 +1,6 @@
// ANTLR GENERATED CODE: DO NOT EDIT // ANTLR GENERATED CODE: DO NOT EDIT
package org.elasticsearch.painless.antlr; package org.elasticsearch.painless.antlr;
import org.elasticsearch.painless.Definition;
import org.antlr.v4.runtime.Lexer; import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.CharStream;
@ -106,6 +105,14 @@ class PainlessLexer extends Lexer {
} }
// Hook for the TYPE rule's semantic predicate; a subclass
// (EnhancedPainlessLexer in this commit) overrides it with the real check.
protected boolean isSimpleType(String name) {
throw new UnsupportedOperationException("Must be implemented in a subclass");
}
// Hook for the DIV/REGEX semantic predicates; a subclass
// (EnhancedPainlessLexer in this commit) overrides it with the lookbehind logic.
protected boolean slashIsRegex() {
throw new UnsupportedOperationException("Must be implemented in a subclass");
}
public PainlessLexer(CharStream input) { public PainlessLexer(CharStream input) {
super(input); super(input);
_interp = new LexerATNSimulator(this,_ATN,_decisionToDFA,_sharedContextCache); _interp = new LexerATNSimulator(this,_ATN,_decisionToDFA,_sharedContextCache);
@ -141,21 +148,21 @@ class PainlessLexer extends Lexer {
private boolean DIV_sempred(RuleContext _localctx, int predIndex) { private boolean DIV_sempred(RuleContext _localctx, int predIndex) {
switch (predIndex) { switch (predIndex) {
case 0: case 0:
return false == SlashStrategy.slashIsRegex(this) ; return false == slashIsRegex() ;
} }
return true; return true;
} }
private boolean REGEX_sempred(RuleContext _localctx, int predIndex) { private boolean REGEX_sempred(RuleContext _localctx, int predIndex) {
switch (predIndex) { switch (predIndex) {
case 1: case 1:
return SlashStrategy.slashIsRegex(this) ; return slashIsRegex() ;
} }
return true; return true;
} }
private boolean TYPE_sempred(RuleContext _localctx, int predIndex) { private boolean TYPE_sempred(RuleContext _localctx, int predIndex) {
switch (predIndex) { switch (predIndex) {
case 2: case 2:
return Definition.isSimpleType(getText()) ; return isSimpleType(getText()) ;
} }
return true; return true;
} }

View File

@ -1,49 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.painless.antlr;
import org.antlr.v4.runtime.Token;
/**
* Utility to figure out if a {@code /} is division or the start of a regex literal.
*/
public class SlashStrategy {
    /** Static-utility holder; never instantiated. */
    private SlashStrategy() {}

    /**
     * Decides whether a {@code /} should lex as division or as the start of a
     * regex literal by looking at the previously emitted token.
     *
     * @param lexer the running lexer; must actually be an
     *        {@code EnhancedPainlessLexer} so the previous token is available
     * @return {@code true} when the slash starts a regex literal,
     *         {@code false} when it is the division operator
     */
    public static boolean slashIsRegex(PainlessLexer lexer) {
        EnhancedPainlessLexer realLexer = (EnhancedPainlessLexer) lexer;
        Token lastToken = realLexer.getPreviousToken();
        if (lastToken == null) {
            // Nothing precedes the slash, so it can only open a regex literal.
            return true;
        }
        switch (lastToken.getType()) {
        // Tokens that can end an expression mean the slash is division.
        case PainlessLexer.RBRACE:
        case PainlessLexer.RP:
        case PainlessLexer.OCTAL:
        case PainlessLexer.HEX:
        case PainlessLexer.INTEGER:
        case PainlessLexer.DECIMAL:
        case PainlessLexer.ID:
        case PainlessLexer.DOTINTEGER:
        case PainlessLexer.DOTID:
            return false;
        default:
            return true;
        }
    }
}