Mechanical changes to make PainlessLexer a bit more obvious (#22695)

Makes `PainlessLexer` abstract and declares the hacks it needs as abstract methods, which `EnhancedPainlessLexer` implements. This feels a little cleaner than referencing the hacks statically.
Nik Everett 2017-01-19 17:31:12 -05:00 committed by GitHub
commit 296f4aac7a
7 changed files with 120 additions and 63 deletions
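In outline, the refactor swaps the static helper calls for protected abstract hooks on the generated lexer, which the hand-written subclass supplies. A minimal schematic of the pattern, with stand-in names and none of the ANTLR plumbing (the real classes are in the diffs below):

```java
// Stand-in for the ANTLR-generated PainlessLexer, post-processed to be abstract.
abstract class GeneratedLexerSketch {
    /** Hook used by the TYPE rule's semantic predicate: is this a whitelisted type? */
    protected abstract boolean isSimpleType(String name);

    /** Hook used by the DIV and REGEX rules: does a '/' here start a regex literal? */
    protected abstract boolean slashIsRegex();

    // A generated semantic predicate now calls the hook on `this` rather than a
    // static helper such as SlashStrategy.slashIsRegex(this).
    boolean divSempred() {
        return false == slashIsRegex();
    }
}

// Stand-in for the hand-written EnhancedPainlessLexer, which supplies the behavior.
final class HandWrittenLexerSketch extends GeneratedLexerSketch {
    @Override
    protected boolean isSimpleType(String name) {
        return name.equals("int"); // the real lexer delegates to Definition.isSimpleType(name)
    }

    @Override
    protected boolean slashIsRegex() {
        return true; // the real lexer looks behind at the previously emitted token
    }
}
```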

View File

@@ -1,7 +1,7 @@
<?xml version="1.0"?>
<project name="ant-stuff">
<!--
grammar regeneration logic
we do this with ant for several reasons:
* remove generated tabs for forbidden-apis
@@ -9,7 +9,7 @@
* fix CRLF line endings for windows consistency
* ability to make classes package-private
* keeping in source code control is easier on IDEs
* regeneration should be rare, no reason to be religious about generated files
* all logic already written and battle tested in lucene build
-->
<target name="regenerate" description="Regenerate antlr lexer and parser" depends="run-antlr"/>
@@ -136,6 +136,10 @@
<replaceregexp match="public ((interface|class) \Q@{grammar}\E\w+)" replace="\1" encoding="UTF-8">
<fileset refid="grammar.fileset"/>
</replaceregexp>
<!-- make the lexer abstract -->
<replaceregexp match="(class \Q@{grammar}\ELexer)" replace="abstract \1" encoding="UTF-8">
<fileset refid="grammar.fileset"/>
</replaceregexp>
<!-- nuke timestamps/filenames in generated files -->
<replaceregexp match="\Q// Generated from \E.*" replace="\/\/ ANTLR GENERATED CODE: DO NOT EDIT" encoding="UTF-8">
<fileset refid="grammar.fileset"/>
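The net effect of the new "make the lexer abstract" step is a one-word change to the declaration ANTLR emits, visible in the `PainlessLexer.java` diff further down. A rough Java rendering of the substitution, for illustration only (the concrete class name stands in for the `\Q@{grammar}\ELexer` placeholder, and the demo class name is made up):

```java
public class MakeLexerAbstractDemo {
    public static void main(String[] args) {
        // The line as ANTLR generates it.
        String generated = "class PainlessLexer extends Lexer {";
        // Equivalent of: match="(class \Q@{grammar}\ELexer)" replace="abstract \1"
        String patched = generated.replaceAll("(class PainlessLexer)", "abstract $1");
        System.out.println(patched); // prints: abstract class PainlessLexer extends Lexer {
    }
}
```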

View File

@@ -19,8 +19,20 @@
lexer grammar PainlessLexer;
@header {
import org.elasticsearch.painless.Definition;
@members{
/**
* Check against the current whitelist to determine whether a token is a type
* or not. Called by the {@code TYPE} token defined in {@code PainlessLexer.g4}.
* See also
* <a href="https://en.wikipedia.org/wiki/The_lexer_hack">The lexer hack</a>.
*/
protected abstract boolean isSimpleType(String name);
/**
* Is the preceding {@code /} the beginning of a regex (true) or a division
* (false).
*/
protected abstract boolean slashIsRegex();
}
WS: [ \t\n\r]+ -> skip;
@@ -59,7 +71,7 @@ INSTANCEOF: 'instanceof';
BOOLNOT: '!';
BWNOT: '~';
MUL: '*';
DIV: '/' { false == SlashStrategy.slashIsRegex(this) }?;
DIV: '/' { false == slashIsRegex() }?;
REM: '%';
ADD: '+';
SUB: '-';
@@ -108,7 +120,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?;
STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' );
REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { SlashStrategy.slashIsRegex(this) }?;
REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { slashIsRegex() }?;
TRUE: 'true';
FALSE: 'false';
@@ -121,7 +133,7 @@ NULL: 'null';
// or not. Note this works by processing one character at a time
// and the rule is added or removed as this happens. This is also known
// as "the lexer hack." See (https://en.wikipedia.org/wiki/The_lexer_hack).
TYPE: ID ( DOT ID )* { Definition.isSimpleType(getText()) }?;
TYPE: ID ( DOT ID )* { isSimpleType(getText()) }?;
ID: [_a-zA-Z] [_a-zA-Z0-9]*;
mode AFTER_DOT;

View File

@@ -26,13 +26,15 @@ import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.TokenSource;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.Pair;
import org.elasticsearch.painless.Definition;
import org.elasticsearch.painless.Location;
/**
* A lexer that is customized for painless. It:
* <ul>
* <li>Overrides the default error behavior to fail on the first error
* <li>Stores the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection
* <li>Overrides the default error behavior to fail on the first error.
* <li>Stores the last token in case we need to do lookbehind for semicolon insertion and regex vs division detection.
* <li>Implements the regex vs division detection.
* <li>Inserts semicolons where they'd improve the language's readability. Rather than hack this into the parser and create a ton of
* ambiguity, we insert them here, where we can use heuristics to do it quickly.
* <li>Enhances the error message when a string contains invalid escape sequences to include a list of valid escape sequences.
@@ -89,6 +91,33 @@ final class EnhancedPainlessLexer extends PainlessLexer {
throw location.createError(new IllegalArgumentException(message, lnvae));
}
@Override
protected boolean isSimpleType(String name) {
return Definition.isSimpleType(name);
}
@Override
protected boolean slashIsRegex() {
Token lastToken = getPreviousToken();
if (lastToken == null) {
return true;
}
switch (lastToken.getType()) {
case PainlessLexer.RBRACE:
case PainlessLexer.RP:
case PainlessLexer.OCTAL:
case PainlessLexer.HEX:
case PainlessLexer.INTEGER:
case PainlessLexer.DECIMAL:
case PainlessLexer.ID:
case PainlessLexer.DOTINTEGER:
case PainlessLexer.DOTID:
return false;
default:
return true;
}
}
private static boolean insertSemicolon(Token previous, Token next) {
if (previous == null || next.getType() != PainlessLexer.RBRACK) {
return false;

View File

@@ -1,7 +1,6 @@
// ANTLR GENERATED CODE: DO NOT EDIT
package org.elasticsearch.painless.antlr;
import org.elasticsearch.painless.Definition;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.CharStream;
@@ -13,7 +12,7 @@ import org.antlr.v4.runtime.dfa.DFA;
import org.antlr.v4.runtime.misc.*;
@SuppressWarnings({"all", "warnings", "unchecked", "unused", "cast"})
class PainlessLexer extends Lexer {
abstract class PainlessLexer extends Lexer {
static { RuntimeMetaData.checkVersion("4.5.1", RuntimeMetaData.VERSION); }
protected static final DFA[] _decisionToDFA;
@@ -106,6 +105,21 @@ class PainlessLexer extends Lexer {
}
/**
* Check against the current whitelist to determine whether a token is a type
* or not. Called by the {@code TYPE} token defined in {@code PainlessLexer.g4}.
* See also
* <a href="https://en.wikipedia.org/wiki/The_lexer_hack">The lexer hack</a>.
*/
protected abstract boolean isSimpleType(String name);
/**
* Is the preceding {@code /} the beginning of a regex (true) or a division
* (false).
*/
protected abstract boolean slashIsRegex();
public PainlessLexer(CharStream input) {
super(input);
_interp = new LexerATNSimulator(this,_ATN,_decisionToDFA,_sharedContextCache);
@@ -141,21 +155,21 @@ class PainlessLexer extends Lexer {
private boolean DIV_sempred(RuleContext _localctx, int predIndex) {
switch (predIndex) {
case 0:
return false == SlashStrategy.slashIsRegex(this) ;
return false == slashIsRegex() ;
}
return true;
}
private boolean REGEX_sempred(RuleContext _localctx, int predIndex) {
switch (predIndex) {
case 1:
return SlashStrategy.slashIsRegex(this) ;
return slashIsRegex() ;
}
return true;
}
private boolean TYPE_sempred(RuleContext _localctx, int predIndex) {
switch (predIndex) {
case 2:
return Definition.isSimpleType(getText()) ;
return isSimpleType(getText()) ;
}
return true;
}
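These generated `sempred` methods are how ANTLR evaluates the predicates during lexing, now through the hooks rather than static helpers. For orientation, the plumbing that feeds source text through the lexer and into the parser looks roughly like the sketch below; the `EnhancedPainlessLexer(CharStream, String)` constructor and the helper class name are assumptions here, since neither is part of this diff:

```java
package org.elasticsearch.painless.antlr;

import org.antlr.v4.runtime.ANTLRInputStream;
import org.antlr.v4.runtime.CommonTokenStream;

// Illustration only: assumes an EnhancedPainlessLexer(CharStream, String) constructor,
// which this commit does not touch.
final class LexerWiringSketch {
    static PainlessParser parserFor(String source, String sourceName) {
        ANTLRInputStream stream = new ANTLRInputStream(source);
        PainlessLexer lexer = new EnhancedPainlessLexer(stream, sourceName);
        CommonTokenStream tokens = new CommonTokenStream(lexer);
        return new PainlessParser(tokens);
    }
}
```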

View File

@@ -1,49 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.painless.antlr;
import org.antlr.v4.runtime.Token;
/**
* Utility to figure out if a {@code /} is division or the start of a regex literal.
*/
public class SlashStrategy {
public static boolean slashIsRegex(PainlessLexer lexer) {
EnhancedPainlessLexer realLexer = (EnhancedPainlessLexer) lexer;
Token lastToken = realLexer.getPreviousToken();
if (lastToken == null) {
return true;
}
switch (lastToken.getType()) {
case PainlessLexer.RBRACE:
case PainlessLexer.RP:
case PainlessLexer.OCTAL:
case PainlessLexer.HEX:
case PainlessLexer.INTEGER:
case PainlessLexer.DECIMAL:
case PainlessLexer.ID:
case PainlessLexer.DOTINTEGER:
case PainlessLexer.DOTID:
return false;
default:
return true;
}
}
}

View File

@@ -0,0 +1,24 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/**
* Lexer, parser, and tree {@link Walker} responsible for turning the code
* into the code-generating nodes in {@link org.elasticsearch.painless.node}.
*/
package org.elasticsearch.painless.antlr;

View File

@@ -0,0 +1,23 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/**
* Implementation of the Painless language.
*/
package org.elasticsearch.painless;