Mirror of https://github.com/honeymoose/OpenSearch.git
Mechanical changes to make PainlessLexer a bit more obvious (#22695)
Makes `PainlessLexer` abstract and declares the hacks it needs as abstract methods that `EnhancedPainlessLexer` implements. This feels a little cleaner than referencing the hacks statically.
Commit 296f4aac7a
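In other words, the grammar's semantic predicates stop reaching out to a static helper and instead call hook methods on the lexer itself, which the hand-written subclass supplies. A simplified sketch of that shape (not the actual Painless sources):

```java
// Stand-in for the ANTLR-generated PainlessLexer: the generated predicates
// call these hooks instead of SlashStrategy.slashIsRegex(this) and
// Definition.isSimpleType(getText()).
abstract class GeneratedLexerBase {
    protected abstract boolean isSimpleType(String name);
    protected abstract boolean slashIsRegex();
}

// Stand-in for the hand-written EnhancedPainlessLexer, which supplies the
// real behavior (whitelist lookup and previous-token lookbehind).
class HandWrittenLexer extends GeneratedLexerBase {
    @Override
    protected boolean isSimpleType(String name) {
        return "def".equals(name); // illustration only; the real code asks Definition
    }

    @Override
    protected boolean slashIsRegex() {
        return true; // illustration only; the real code inspects the previous token
    }
}
```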
The Ant build file that drives grammar regeneration (project `ant-stuff`):

@@ -1,7 +1,7 @@
 <?xml version="1.0"?>
 <project name="ant-stuff">
 
 <!--
 grammar regeneration logic
 we do this with ant for several reasons:
 * remove generated tabs for forbidden-apis
@@ -9,7 +9,7 @@
 * fix CRLF line endings for windows consistency
 * ability to make classes package-private
 * keeping in source code control is easier on IDEs
 * regeneration should be rare, no reason to be religious about generated files
 * all logic already written and battle tested in lucene build
 -->
 <target name="regenerate" description="Regenerate antlr lexer and parser" depends="run-antlr"/>
@@ -136,6 +136,10 @@
 <replaceregexp match="public ((interface|class) \Q@{grammar}\E\w+)" replace="\1" encoding="UTF-8">
   <fileset refid="grammar.fileset"/>
 </replaceregexp>
+<!-- make the lexer abstract -->
+<replaceregexp match="(class \Q@{grammar}\ELexer)" replace="abstract \1" encoding="UTF-8">
+  <fileset refid="grammar.fileset"/>
+</replaceregexp>
 <!-- nuke timestamps/filenames in generated files -->
 <replaceregexp match="\Q// Generated from \E.*" replace="\/\/ ANTLR GENERATED CODE: DO NOT EDIT" encoding="UTF-8">
   <fileset refid="grammar.fileset"/>
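The new `<replaceregexp>` is the only functional build change: after ANTLR regenerates the sources (and an earlier rule strips `public`), it rewrites the class declaration so the generated lexer comes out abstract. Roughly the same substitution in plain `java.util.regex`, with `@{grammar}` expanded to `Painless` by hand; note Ant writes the back reference as `\1` where Java uses `$1`:

```java
import java.util.regex.Pattern;

// Illustrative approximation of the Ant <replaceregexp> above, not the build
// step itself: make the generated lexer declaration abstract.
public class MakeLexerAbstractDemo {
    public static void main(String[] args) {
        String generated = "class PainlessLexer extends Lexer {";
        String patched = Pattern.compile("(class \\QPainless\\ELexer)")
                .matcher(generated)
                .replaceAll("abstract $1");
        System.out.println(patched); // abstract class PainlessLexer extends Lexer {
    }
}
```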
The grammar, `PainlessLexer.g4`:

@@ -19,8 +19,20 @@
 
 lexer grammar PainlessLexer;
 
-@header {
-import org.elasticsearch.painless.Definition;
+@members{
+/**
+ * Check against the current whitelist to determine whether a token is a type
+ * or not. Called by the {@code TYPE} token defined in {@code PainlessLexer.g4}.
+ * See also
+ * <a href="https://en.wikipedia.org/wiki/The_lexer_hack">The lexer hack</a>.
+ */
+protected abstract boolean isSimpleType(String name);
+
+/**
+ * Is the preceding {@code /} a the beginning of a regex (true) or a division
+ * (false).
+ */
+protected abstract boolean slashIsRegex();
 }
 
 WS: [ \t\n\r]+ -> skip;
@@ -59,7 +71,7 @@ INSTANCEOF: 'instanceof';
 BOOLNOT: '!';
 BWNOT: '~';
 MUL: '*';
-DIV: '/' { false == SlashStrategy.slashIsRegex(this) }?;
+DIV: '/' { false == slashIsRegex() }?;
 REM: '%';
 ADD: '+';
 SUB: '-';
@@ -108,7 +120,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
 DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?;
 
 STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' );
-REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { SlashStrategy.slashIsRegex(this) }?;
+REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { slashIsRegex() }?;
 
 TRUE: 'true';
 FALSE: 'false';
@@ -121,7 +133,7 @@ NULL: 'null';
 // or not. Note this works by processing one character at a time
 // and the rule is added or removed as this happens. This is also known
 // as "the lexer hack." See (https://en.wikipedia.org/wiki/The_lexer_hack).
-TYPE: ID ( DOT ID )* { Definition.isSimpleType(getText()) }?;
+TYPE: ID ( DOT ID )* { isSimpleType(getText()) }?;
 ID: [_a-zA-Z] [_a-zA-Z0-9]*;
 
 mode AFTER_DOT;
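The `TYPE` rule above is where the classic lexer hack lives: the same identifier text lexes as `TYPE` only when the whitelist says it names a type, and otherwise falls through to `ID`. A toy sketch of that decision (the whitelist here is invented for the example; the real check delegates to `Definition.isSimpleType`, wired in below):

```java
import java.util.Set;

// Toy illustration of the TYPE-vs-ID decision driven by isSimpleType().
// FAKE_WHITELIST is made up; Painless consults its Definition whitelist.
public class LexerHackDemo {
    private static final Set<String> FAKE_WHITELIST = Set.of("int", "def", "ArrayList");

    static String tokenFor(String text) {
        // Mirrors: TYPE: ID ( DOT ID )* { isSimpleType(getText()) }?;
        return FAKE_WHITELIST.contains(text) ? "TYPE" : "ID";
    }

    public static void main(String[] args) {
        System.out.println(tokenFor("ArrayList")); // TYPE
        System.out.println(tokenFor("myCounter")); // ID
    }
}
```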
The hand-written `EnhancedPainlessLexer.java`:

@@ -26,13 +26,15 @@ import org.antlr.v4.runtime.Token;
 import org.antlr.v4.runtime.TokenSource;
 import org.antlr.v4.runtime.misc.Interval;
 import org.antlr.v4.runtime.misc.Pair;
+import org.elasticsearch.painless.Definition;
 import org.elasticsearch.painless.Location;
 
 /**
  * A lexer that is customized for painless. It:
  * <ul>
- * <li>Overrides the default error behavior to fail on the first error
- * <li>Stores the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection
+ * <li>Overrides the default error behavior to fail on the first error.
+ * <li>Stores the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection.
+ * <li>Implements the regex vs division detection.
  * <li>Insert semicolons where they'd improve the language's readability. Rather than hack this into the parser and create a ton of
  * ambiguity we hack them here where we can use heuristics to do it quickly.
  * <li>Enhances the error message when a string contains invalid escape sequences to include a list of valid escape sequences.
@@ -89,6 +91,33 @@ final class EnhancedPainlessLexer extends PainlessLexer {
         throw location.createError(new IllegalArgumentException(message, lnvae));
     }
 
+    @Override
+    protected boolean isSimpleType(String name) {
+        return Definition.isSimpleType(name);
+    }
+
+    @Override
+    protected boolean slashIsRegex() {
+        Token lastToken = getPreviousToken();
+        if (lastToken == null) {
+            return true;
+        }
+        switch (lastToken.getType()) {
+        case PainlessLexer.RBRACE:
+        case PainlessLexer.RP:
+        case PainlessLexer.OCTAL:
+        case PainlessLexer.HEX:
+        case PainlessLexer.INTEGER:
+        case PainlessLexer.DECIMAL:
+        case PainlessLexer.ID:
+        case PainlessLexer.DOTINTEGER:
+        case PainlessLexer.DOTID:
+            return false;
+        default:
+            return true;
+        }
+    }
+
     private static boolean insertSemicolon(Token previous, Token next) {
         if (previous == null || next.getType() != PainlessLexer.RBRACK) {
             return false;
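The `slashIsRegex()` implementation above is a lookbehind heuristic: a `/` that follows something value-like (an identifier, a literal, `)` or `}`) must be division, and anything else starts a regex. A standalone sketch of the same decision, with token types reduced to strings for illustration:

```java
import java.util.Set;

// Mirrors the switch in EnhancedPainlessLexer.slashIsRegex(), with token
// types reduced to strings. "ASSIGN" below is a made-up stand-in for
// whatever non-value token precedes the '/' in the regex case.
public class SlashHeuristicDemo {
    private static final Set<String> VALUE_LIKE = Set.of(
            "RBRACE", "RP", "OCTAL", "HEX", "INTEGER", "DECIMAL", "ID", "DOTINTEGER", "DOTID");

    static boolean slashIsRegex(String previousTokenType) {
        // No previous token (start of input) defaults to "regex", as above.
        return previousTokenType == null || !VALUE_LIKE.contains(previousTokenType);
    }

    public static void main(String[] args) {
        System.out.println(slashIsRegex("ID"));     // false: "x / 2" is division
        System.out.println(slashIsRegex("ASSIGN")); // true:  "x = /a+/" starts a regex
        System.out.println(slashIsRegex(null));     // true:  nothing precedes the '/'
    }
}
```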
The ANTLR-generated `PainlessLexer.java` (regenerated by the build):

@@ -1,7 +1,6 @@
 // ANTLR GENERATED CODE: DO NOT EDIT
 package org.elasticsearch.painless.antlr;
 
-import org.elasticsearch.painless.Definition;
 
 import org.antlr.v4.runtime.Lexer;
 import org.antlr.v4.runtime.CharStream;
@@ -13,7 +12,7 @@ import org.antlr.v4.runtime.dfa.DFA;
 import org.antlr.v4.runtime.misc.*;
 
 @SuppressWarnings({"all", "warnings", "unchecked", "unused", "cast"})
-class PainlessLexer extends Lexer {
+abstract class PainlessLexer extends Lexer {
   static { RuntimeMetaData.checkVersion("4.5.1", RuntimeMetaData.VERSION); }
 
   protected static final DFA[] _decisionToDFA;
@@ -106,6 +105,21 @@ class PainlessLexer extends Lexer {
   }
 
 
+  /**
+   * Check against the current whitelist to determine whether a token is a type
+   * or not. Called by the {@code TYPE} token defined in {@code PainlessLexer.g4}.
+   * See also
+   * <a href="https://en.wikipedia.org/wiki/The_lexer_hack">The lexer hack</a>.
+   */
+  protected abstract boolean isSimpleType(String name);
+
+  /**
+   * Is the preceding {@code /} a the beginning of a regex (true) or a division
+   * (false).
+   */
+  protected abstract boolean slashIsRegex();
+
+
   public PainlessLexer(CharStream input) {
     super(input);
     _interp = new LexerATNSimulator(this,_ATN,_decisionToDFA,_sharedContextCache);
@@ -141,21 +155,21 @@ class PainlessLexer extends Lexer {
   private boolean DIV_sempred(RuleContext _localctx, int predIndex) {
     switch (predIndex) {
     case 0:
-      return false == SlashStrategy.slashIsRegex(this) ;
+      return false == slashIsRegex() ;
     }
     return true;
   }
   private boolean REGEX_sempred(RuleContext _localctx, int predIndex) {
     switch (predIndex) {
     case 1:
-      return SlashStrategy.slashIsRegex(this) ;
+      return slashIsRegex() ;
     }
     return true;
   }
   private boolean TYPE_sempred(RuleContext _localctx, int predIndex) {
     switch (predIndex) {
     case 2:
-      return Definition.isSimpleType(getText()) ;
+      return isSimpleType(getText()) ;
     }
     return true;
   }
`SlashStrategy.java`, now unneeded, is deleted:

@@ -1,49 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.painless.antlr;
-
-import org.antlr.v4.runtime.Token;
-
-/**
- * Utility to figure out if a {@code /} is division or the start of a regex literal.
- */
-public class SlashStrategy {
-    public static boolean slashIsRegex(PainlessLexer lexer) {
-        EnhancedPainlessLexer realLexer = (EnhancedPainlessLexer) lexer;
-        Token lastToken = realLexer.getPreviousToken();
-        if (lastToken == null) {
-            return true;
-        }
-        switch (lastToken.getType()) {
-        case PainlessLexer.RBRACE:
-        case PainlessLexer.RP:
-        case PainlessLexer.OCTAL:
-        case PainlessLexer.HEX:
-        case PainlessLexer.INTEGER:
-        case PainlessLexer.DECIMAL:
-        case PainlessLexer.ID:
-        case PainlessLexer.DOTINTEGER:
-        case PainlessLexer.DOTID:
-            return false;
-        default:
-            return true;
-        }
-    }
-}
A new package description (`package-info`) is added for `org.elasticsearch.painless.antlr`:

@@ -0,0 +1,24 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Lexer, parser, and tree {@link Walker} responsible for turning the code
+ * generating nodes in {@link org.elasticsearch.painless.node}.
+ */
+package org.elasticsearch.painless.antlr;
And likewise for `org.elasticsearch.painless`:

@@ -0,0 +1,23 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Implementation of the Painless language.
+ */
+package org.elasticsearch.painless;