Mirror of https://github.com/honeymoose/OpenSearch.git
Mechanical changes to make PainlessLexer a bit more obvious (#22695)
Makes `PainlessLexer` abstract and declares the hacks it needs as abstract methods that `EnhancedPainlessLexer` implements. This feels a little cleaner than referencing the hacks statically.
Commit 296f4aac7a
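In other words, the grammar's semantic predicates stop reaching out to a static helper and instead call hook methods on the lexer itself, which the hand-written subclass supplies. A simplified sketch of that shape (not the actual Painless sources):

```java
// Stand-in for the ANTLR-generated PainlessLexer: the generated predicates
// call these hooks instead of SlashStrategy.slashIsRegex(this) and
// Definition.isSimpleType(getText()).
abstract class GeneratedLexerBase {
    protected abstract boolean isSimpleType(String name);
    protected abstract boolean slashIsRegex();
}

// Stand-in for the hand-written EnhancedPainlessLexer, which supplies the
// real behavior (whitelist lookup and previous-token lookbehind).
class HandWrittenLexer extends GeneratedLexerBase {
    @Override
    protected boolean isSimpleType(String name) {
        return "def".equals(name); // illustration only; the real code asks Definition
    }

    @Override
    protected boolean slashIsRegex() {
        return true; // illustration only; the real code inspects the previous token
    }
}
```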
The Ant build file that drives grammar regeneration (project `ant-stuff`):

@@ -1,7 +1,7 @@
 <?xml version="1.0"?>
 <project name="ant-stuff">
 
 <!--
 grammar regeneration logic
 we do this with ant for several reasons:
 * remove generated tabs for forbidden-apis
@@ -9,7 +9,7 @@
 * fix CRLF line endings for windows consistency
 * ability to make classes package-private
 * keeping in source code control is easier on IDEs
 * regeneration should be rare, no reason to be religious about generated files
 * all logic already written and battle tested in lucene build
 -->
 <target name="regenerate" description="Regenerate antlr lexer and parser" depends="run-antlr"/>
@@ -136,6 +136,10 @@
 <replaceregexp match="public ((interface|class) \Q@{grammar}\E\w+)" replace="\1" encoding="UTF-8">
   <fileset refid="grammar.fileset"/>
 </replaceregexp>
+<!-- make the lexer abstract -->
+<replaceregexp match="(class \Q@{grammar}\ELexer)" replace="abstract \1" encoding="UTF-8">
+  <fileset refid="grammar.fileset"/>
+</replaceregexp>
 <!-- nuke timestamps/filenames in generated files -->
 <replaceregexp match="\Q// Generated from \E.*" replace="\/\/ ANTLR GENERATED CODE: DO NOT EDIT" encoding="UTF-8">
   <fileset refid="grammar.fileset"/>
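The new `<replaceregexp>` is the only functional build change: after ANTLR regenerates the sources (and an earlier rule strips `public`), it rewrites the class declaration so the generated lexer comes out abstract. Roughly the same substitution in plain `java.util.regex`, with `@{grammar}` expanded to `Painless` by hand; note Ant writes the back reference as `\1` where Java uses `$1`:

```java
import java.util.regex.Pattern;

// Illustrative approximation of the Ant <replaceregexp> above, not the build
// step itself: make the generated lexer declaration abstract.
public class MakeLexerAbstractDemo {
    public static void main(String[] args) {
        String generated = "class PainlessLexer extends Lexer {";
        String patched = Pattern.compile("(class \\QPainless\\ELexer)")
                .matcher(generated)
                .replaceAll("abstract $1");
        System.out.println(patched); // abstract class PainlessLexer extends Lexer {
    }
}
```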
The grammar, `PainlessLexer.g4`:

@@ -19,8 +19,20 @@
 
 lexer grammar PainlessLexer;
 
-@header {
-import org.elasticsearch.painless.Definition;
+@members{
+/**
+ * Check against the current whitelist to determine whether a token is a type
+ * or not. Called by the {@code TYPE} token defined in {@code PainlessLexer.g4}.
+ * See also
+ * <a href="https://en.wikipedia.org/wiki/The_lexer_hack">The lexer hack</a>.
+ */
+protected abstract boolean isSimpleType(String name);
+
+/**
+ * Is the preceding {@code /} a the beginning of a regex (true) or a division
+ * (false).
+ */
+protected abstract boolean slashIsRegex();
 }
 
 WS: [ \t\n\r]+ -> skip;
@@ -59,7 +71,7 @@ INSTANCEOF: 'instanceof';
 BOOLNOT: '!';
 BWNOT: '~';
 MUL: '*';
-DIV: '/' { false == SlashStrategy.slashIsRegex(this) }?;
+DIV: '/' { false == slashIsRegex() }?;
 REM: '%';
 ADD: '+';
 SUB: '-';
@@ -108,7 +120,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
 DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?;
 
 STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' );
-REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { SlashStrategy.slashIsRegex(this) }?;
+REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { slashIsRegex() }?;
 
 TRUE: 'true';
 FALSE: 'false';
@@ -121,7 +133,7 @@ NULL: 'null';
 // or not. Note this works by processing one character at a time
 // and the rule is added or removed as this happens. This is also known
 // as "the lexer hack." See (https://en.wikipedia.org/wiki/The_lexer_hack).
-TYPE: ID ( DOT ID )* { Definition.isSimpleType(getText()) }?;
+TYPE: ID ( DOT ID )* { isSimpleType(getText()) }?;
 ID: [_a-zA-Z] [_a-zA-Z0-9]*;
 
 mode AFTER_DOT;
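The `TYPE` rule above is where the classic lexer hack lives: the same identifier text lexes as `TYPE` only when the whitelist says it names a type, and otherwise falls through to `ID`. A toy sketch of that decision (the whitelist here is invented for the example; the real check delegates to `Definition.isSimpleType`, wired in below):

```java
import java.util.Set;

// Toy illustration of the TYPE-vs-ID decision driven by isSimpleType().
// FAKE_WHITELIST is made up; Painless consults its Definition whitelist.
public class LexerHackDemo {
    private static final Set<String> FAKE_WHITELIST = Set.of("int", "def", "ArrayList");

    static String tokenFor(String text) {
        // Mirrors: TYPE: ID ( DOT ID )* { isSimpleType(getText()) }?;
        return FAKE_WHITELIST.contains(text) ? "TYPE" : "ID";
    }

    public static void main(String[] args) {
        System.out.println(tokenFor("ArrayList")); // TYPE
        System.out.println(tokenFor("myCounter")); // ID
    }
}
```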
The hand-written `EnhancedPainlessLexer.java`:

@@ -26,13 +26,15 @@ import org.antlr.v4.runtime.Token;
 import org.antlr.v4.runtime.TokenSource;
 import org.antlr.v4.runtime.misc.Interval;
 import org.antlr.v4.runtime.misc.Pair;
+import org.elasticsearch.painless.Definition;
 import org.elasticsearch.painless.Location;
 
 /**
  * A lexer that is customized for painless. It:
  * <ul>
- * <li>Overrides the default error behavior to fail on the first error
- * <li>Stores the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection
+ * <li>Overrides the default error behavior to fail on the first error.
+ * <li>Stores the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection.
+ * <li>Implements the regex vs division detection.
  * <li>Insert semicolons where they'd improve the language's readability. Rather than hack this into the parser and create a ton of
  * ambiguity we hack them here where we can use heuristics to do it quickly.
  * <li>Enhances the error message when a string contains invalid escape sequences to include a list of valid escape sequences.
@@ -89,6 +91,33 @@ final class EnhancedPainlessLexer extends PainlessLexer {
         throw location.createError(new IllegalArgumentException(message, lnvae));
     }
 
+    @Override
+    protected boolean isSimpleType(String name) {
+        return Definition.isSimpleType(name);
+    }
+
+    @Override
+    protected boolean slashIsRegex() {
+        Token lastToken = getPreviousToken();
+        if (lastToken == null) {
+            return true;
+        }
+        switch (lastToken.getType()) {
+        case PainlessLexer.RBRACE:
+        case PainlessLexer.RP:
+        case PainlessLexer.OCTAL:
+        case PainlessLexer.HEX:
+        case PainlessLexer.INTEGER:
+        case PainlessLexer.DECIMAL:
+        case PainlessLexer.ID:
+        case PainlessLexer.DOTINTEGER:
+        case PainlessLexer.DOTID:
+            return false;
+        default:
+            return true;
+        }
+    }
+
     private static boolean insertSemicolon(Token previous, Token next) {
         if (previous == null || next.getType() != PainlessLexer.RBRACK) {
             return false;
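The `slashIsRegex()` implementation above is a lookbehind heuristic: a `/` that follows something value-like (an identifier, a literal, `)` or `}`) must be division, and anything else starts a regex. A standalone sketch of the same decision, with token types reduced to strings for illustration:

```java
import java.util.Set;

// Mirrors the switch in EnhancedPainlessLexer.slashIsRegex(), with token
// types reduced to strings. "ASSIGN" below is a made-up stand-in for
// whatever non-value token precedes the '/' in the regex case.
public class SlashHeuristicDemo {
    private static final Set<String> VALUE_LIKE = Set.of(
            "RBRACE", "RP", "OCTAL", "HEX", "INTEGER", "DECIMAL", "ID", "DOTINTEGER", "DOTID");

    static boolean slashIsRegex(String previousTokenType) {
        // No previous token (start of input) defaults to "regex", as above.
        return previousTokenType == null || !VALUE_LIKE.contains(previousTokenType);
    }

    public static void main(String[] args) {
        System.out.println(slashIsRegex("ID"));     // false: "x / 2" is division
        System.out.println(slashIsRegex("ASSIGN")); // true:  "x = /a+/" starts a regex
        System.out.println(slashIsRegex(null));     // true:  nothing precedes the '/'
    }
}
```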
The ANTLR-generated `PainlessLexer.java` (regenerated by the build):

@@ -1,7 +1,6 @@
 // ANTLR GENERATED CODE: DO NOT EDIT
 package org.elasticsearch.painless.antlr;
 
-import org.elasticsearch.painless.Definition;
 
 import org.antlr.v4.runtime.Lexer;
 import org.antlr.v4.runtime.CharStream;
@@ -13,7 +12,7 @@ import org.antlr.v4.runtime.dfa.DFA;
 import org.antlr.v4.runtime.misc.*;
 
 @SuppressWarnings({"all", "warnings", "unchecked", "unused", "cast"})
-class PainlessLexer extends Lexer {
+abstract class PainlessLexer extends Lexer {
   static { RuntimeMetaData.checkVersion("4.5.1", RuntimeMetaData.VERSION); }
 
   protected static final DFA[] _decisionToDFA;
@@ -106,6 +105,21 @@ class PainlessLexer extends Lexer {
   }
 
 
+  /**
+   * Check against the current whitelist to determine whether a token is a type
+   * or not. Called by the {@code TYPE} token defined in {@code PainlessLexer.g4}.
+   * See also
+   * <a href="https://en.wikipedia.org/wiki/The_lexer_hack">The lexer hack</a>.
+   */
+  protected abstract boolean isSimpleType(String name);
+
+  /**
+   * Is the preceding {@code /} a the beginning of a regex (true) or a division
+   * (false).
+   */
+  protected abstract boolean slashIsRegex();
+
+
   public PainlessLexer(CharStream input) {
     super(input);
     _interp = new LexerATNSimulator(this,_ATN,_decisionToDFA,_sharedContextCache);
@@ -141,21 +155,21 @@ class PainlessLexer extends Lexer {
   private boolean DIV_sempred(RuleContext _localctx, int predIndex) {
     switch (predIndex) {
     case 0:
-      return false == SlashStrategy.slashIsRegex(this) ;
+      return false == slashIsRegex() ;
     }
     return true;
   }
   private boolean REGEX_sempred(RuleContext _localctx, int predIndex) {
     switch (predIndex) {
     case 1:
-      return SlashStrategy.slashIsRegex(this) ;
+      return slashIsRegex() ;
     }
     return true;
   }
   private boolean TYPE_sempred(RuleContext _localctx, int predIndex) {
     switch (predIndex) {
     case 2:
-      return Definition.isSimpleType(getText()) ;
+      return isSimpleType(getText()) ;
     }
     return true;
   }
`SlashStrategy.java`, now unneeded, is deleted:

@@ -1,49 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.painless.antlr;
-
-import org.antlr.v4.runtime.Token;
-
-/**
- * Utility to figure out if a {@code /} is division or the start of a regex literal.
- */
-public class SlashStrategy {
-    public static boolean slashIsRegex(PainlessLexer lexer) {
-        EnhancedPainlessLexer realLexer = (EnhancedPainlessLexer) lexer;
-        Token lastToken = realLexer.getPreviousToken();
-        if (lastToken == null) {
-            return true;
-        }
-        switch (lastToken.getType()) {
-        case PainlessLexer.RBRACE:
-        case PainlessLexer.RP:
-        case PainlessLexer.OCTAL:
-        case PainlessLexer.HEX:
-        case PainlessLexer.INTEGER:
-        case PainlessLexer.DECIMAL:
-        case PainlessLexer.ID:
-        case PainlessLexer.DOTINTEGER:
-        case PainlessLexer.DOTID:
-            return false;
-        default:
-            return true;
-        }
-    }
-}
A new package description (`package-info`) is added for `org.elasticsearch.painless.antlr`:

@@ -0,0 +1,24 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Lexer, parser, and tree {@link Walker} responsible for turning the code
+ * generating nodes in {@link org.elasticsearch.painless.node}.
+ */
+package org.elasticsearch.painless.antlr;
And likewise for `org.elasticsearch.painless`:

@@ -0,0 +1,23 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Implementation of the Painless language.
+ */
+package org.elasticsearch.painless;