Revert "refactor(compiler): support encoded entity tokens when lexing markup (#42062)" (#43033)

This reverts commit 942b24d5ea5d36ad4e53ed435bda35a6ae6876c9.

PR Close #43033
2021-08-03 14:49:05 -07:00 · 2021-08-03 14:49:05 -07:00 · 8d8ab4775c
commit 8d8ab4775c
parent ea5ed4e4d4
3 changed files with 57 additions and 97 deletions
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -23,7 +23,6 @@ export enum TokenType {
  ESCAPABLE_RAW_TEXT,
  RAW_TEXT,
  INTERPOLATION,
-  ENCODED_ENTITY,
  COMMENT_START,
  COMMENT_END,
  CDATA_START,
@@ -396,16 +395,19 @@ class _Tokenizer {
    }
  }

-  private _readChar(): string {
-    // Don't rely upon reading directly from `_input` as the actual char value
-    // may have been generated from an escape sequence.
-    const char = String.fromCodePoint(this._cursor.peek());
-    this._cursor.advance();
-    return char;
+  private _readChar(decodeEntities: boolean): string {
+    if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
+      return this._decodeEntity();
+    } else {
+      // Don't rely upon reading directly from `_input` as the actual char value
+      // may have been generated from an escape sequence.
+      const char = String.fromCodePoint(this._cursor.peek());
+      this._cursor.advance();
+      return char;
+    }
  }

-  private _consumeEntity(textTokenType: TokenType): void {
-    this._beginToken(TokenType.ENCODED_ENTITY);
+  private _decodeEntity(): string {
    const start = this._cursor.clone();
    this._cursor.advance();
    if (this._attemptCharCode(chars.$HASH)) {
@@ -425,7 +427,7 @@ class _Tokenizer {
      this._cursor.advance();
      try {
        const charCode = parseInt(strNum, isHex ? 16 : 10);
-        this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
+        return String.fromCharCode(charCode);
      } catch {
        throw this._createError(
            _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -434,25 +436,21 @@ class _Tokenizer {
      const nameStart = this._cursor.clone();
      this._attemptCharCodeUntilFn(isNamedEntityEnd);
      if (this._cursor.peek() != chars.$SEMICOLON) {
-        // No semicolon was found so abort the encoded entity token that was in progress, and treat
-        // this as a text token
-        this._beginToken(textTokenType, start);
        this._cursor = nameStart;
-        this._endToken(['&']);
-      } else {
-        const name = this._cursor.getChars(nameStart);
-        this._cursor.advance();
-        const char = NAMED_ENTITIES[name];
-        if (!char) {
-          throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
-        }
-        this._endToken([char, `&${name};`]);
+        return '&';
      }
+      const name = this._cursor.getChars(nameStart);
+      this._cursor.advance();
+      const char = NAMED_ENTITIES[name];
+      if (!char) {
+        throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
+      }
+      return char;
    }
  }

-  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
-    this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
+  private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
+    this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
    const parts: string[] = [];
    while (true) {
      const tagCloseStart = this._cursor.clone();
@@ -461,16 +459,9 @@ class _Tokenizer {
      if (foundEndMarker) {
        break;
      }
-      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
-        parts.length = 0;
-        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
-        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
-      } else {
-        parts.push(this._readChar());
-      }
+      parts.push(this._readChar(decodeEntities));
    }
-    this._endToken([this._processCarriageReturns(parts.join(''))]);
+    return this._endToken([this._processCarriageReturns(parts.join(''))]);
  }

  private _consumeComment(start: CharacterCursor) {
@@ -572,8 +563,8 @@ class _Tokenizer {
    }
  }

-  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
-    this._consumeRawText(consumeEntities, () => {
+  private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
+    this._consumeRawText(decodeEntities, () => {
      if (!this._attemptCharCode(chars.$LT)) return false;
      if (!this._attemptCharCode(chars.$SLASH)) return false;
      this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -721,16 +712,11 @@ class _Tokenizer {
      const current = this._cursor.clone();
      if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
        this._endToken([this._processCarriageReturns(parts.join(''))], current);
-        parts.length = 0;
        this._consumeInterpolation(interpolationTokenType, current);
-        this._beginToken(textTokenType);
-      } else if (this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
        parts.length = 0;
-        this._consumeEntity(textTokenType);
        this._beginToken(textTokenType);
      } else {
-        parts.push(this._readChar());
+        parts.push(this._readChar(true));
      }
    }

@@ -909,9 +895,7 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
  let lastDstToken: Token|undefined = undefined;
  for (let i = 0; i < srcTokens.length; i++) {
    const token = srcTokens[i];
-    if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
-        (lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
-         token.type == TokenType.ATTR_VALUE_TEXT)) {
+    if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
      lastDstToken.parts[0]! += token.parts[0];
      lastDstToken.sourceSpan.end = token.sourceSpan.end;
    } else {
--- a/packages/compiler/src/ml_parser/parser.ts
+++ b/packages/compiler/src/ml_parser/parser.ts
@@ -226,21 +226,20 @@ class _TreeBuilder {
      }
    }

-    // For now recombine text, interpolation and entity tokens
-    while (this._peek.type === lex.TokenType.INTERPOLATION ||
-           this._peek.type === lex.TokenType.TEXT ||
-           this._peek.type === lex.TokenType.ENCODED_ENTITY) {
-      token = this._advance();
-      if (token.type === lex.TokenType.INTERPOLATION) {
-        // For backward compatibility we decode HTML entities that appear in interpolation
-        // expressions. This is arguably a bug, but it could be a considerable breaking change to
-        // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
-        // chain after View Engine has been removed.
-        text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-      } else if (token.type === lex.TokenType.ENCODED_ENTITY) {
-        text += token.parts[0];
-      } else {
-        text += token.parts.join('');
+    // For now recombine text and interpolation tokens
+    if (this._peek.type === lex.TokenType.INTERPOLATION) {
+      while (this._peek.type === lex.TokenType.INTERPOLATION ||
+             this._peek.type === lex.TokenType.TEXT) {
+        token = this._advance();
+        if (token.type === lex.TokenType.INTERPOLATION) {
+          // For backward compatibility we decode HTML entities that appear in interpolation
+          // expressions. This is arguably a bug, but it could be a considerable breaking change to
+          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+          // chain after View Engine has been removed.
+          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else {
+          text += token.parts.join('');
+        }
      }
    }

@@ -370,17 +369,16 @@ class _TreeBuilder {
      this._advance();
    }

-    // Consume the attribute value
+    // Consume the value
    let value = '';
    let valueStartSpan: ParseSourceSpan|undefined = undefined;
    let valueEnd: ParseLocation|undefined = undefined;
    if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
      valueStartSpan = this._peek.sourceSpan;
      valueEnd = this._peek.sourceSpan.end;
-      // For now recombine text, interpolation and entity tokens
+      // For now we are recombining text and interpolation tokens
      while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
-             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
-             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
+             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
        let valueToken = this._advance();
        if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
          // For backward compatibility we decode HTML entities that appear in interpolation
@@ -388,8 +386,6 @@ class _TreeBuilder {
          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
          // chain after View Engine has been removed.
          value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
-          value += valueToken.parts[0];
        } else {
          value += valueToken.parts.join('');
        }
--- a/packages/compiler/test/ml_parser/lexer_spec.ts
+++ b/packages/compiler/test/ml_parser/lexer_spec.ts
@@ -407,11 +407,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
          [lex.TokenType.TAG_OPEN_START, '', 't'],
          [lex.TokenType.ATTR_NAME, '', 'a'],
          [lex.TokenType.ATTR_QUOTE, '"'],
-          [lex.TokenType.ATTR_VALUE_TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-          [lex.TokenType.ATTR_VALUE_TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-          [lex.TokenType.ATTR_VALUE_TEXT, ''],
+          [lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
          [lex.TokenType.ATTR_QUOTE, '"'],
          [lex.TokenType.TAG_OPEN_END],
          [lex.TokenType.EOF],
@@ -526,60 +522,50 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
    describe('entities', () => {
      it('should parse named entities', () => {
        expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
          [lex.TokenType.EOF],
        ]);
      });

      it('should parse hexadecimal entities', () => {
        expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'AA'],
          [lex.TokenType.EOF],
        ]);
      });

      it('should parse decimal entities', () => {
        expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'A'],
          [lex.TokenType.EOF],
        ]);
      });

      it('should store the locations', () => {
        expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
          [lex.TokenType.EOF, ''],
        ]);
      });

      it('should report malformed/unknown entities', () => {
        expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unknown entity "tbo" - use the "&#<decimal>;" or  "&#x<hex>;" syntax', '0:0'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#3sdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unable to parse entity "&#3s" - decimal character reference entities must end with ";"',
          '0:4'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
          '0:5'
        ]]);

        expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
-          [lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
+          [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
        ]);
      });
    });
@@ -657,16 +643,12 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u

      it('should parse entities', () => {
        expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
          [lex.TokenType.EOF],
        ]);

        expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
          [lex.TokenType.EOF, ''],
        ]);
      });
@@ -912,9 +894,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
        expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
          [lex.TokenType.TAG_OPEN_START, '', 'title'],
          [lex.TokenType.TAG_OPEN_END],
-          [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
+          [lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
          [lex.TokenType.TAG_CLOSE, '', 'title'],
          [lex.TokenType.EOF],
        ]);