From 8d8ab4775c1ff197ad101cbd33690bc8620a3c56 Mon Sep 17 00:00:00 2001
From: atscott
Date: Tue, 3 Aug 2021 14:49:05 -0700
Subject: [PATCH] Revert "refactor(compiler): support encoded entity tokens
 when lexing markup (#42062)" (#43033)

This reverts commit 942b24d5ea5d36ad4e53ed435bda35a6ae6876c9.

PR Close #43033
---
 packages/compiler/src/ml_parser/lexer.ts      | 72 ++++++++-----------
 packages/compiler/src/ml_parser/parser.ts     | 38 +++++-----
 .../compiler/test/ml_parser/lexer_spec.ts     | 44 ++++--------
 3 files changed, 57 insertions(+), 97 deletions(-)

diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts
index 13d3a6bfba..d7306a2389 100644
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -23,7 +23,6 @@ export enum TokenType {
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
   INTERPOLATION,
-  ENCODED_ENTITY,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -396,16 +395,19 @@ class _Tokenizer {
     }
   }
 
-  private _readChar(): string {
-    // Don't rely upon reading directly from `_input` as the actual char value
-    // may have been generated from an escape sequence.
-    const char = String.fromCodePoint(this._cursor.peek());
-    this._cursor.advance();
-    return char;
+  private _readChar(decodeEntities: boolean): string {
+    if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
+      return this._decodeEntity();
+    } else {
+      // Don't rely upon reading directly from `_input` as the actual char value
+      // may have been generated from an escape sequence.
+      const char = String.fromCodePoint(this._cursor.peek());
+      this._cursor.advance();
+      return char;
+    }
   }
 
-  private _consumeEntity(textTokenType: TokenType): void {
-    this._beginToken(TokenType.ENCODED_ENTITY);
+  private _decodeEntity(): string {
     const start = this._cursor.clone();
     this._cursor.advance();
     if (this._attemptCharCode(chars.$HASH)) {
@@ -425,7 +427,7 @@ class _Tokenizer {
       this._cursor.advance();
       try {
         const charCode = parseInt(strNum, isHex ? 16 : 10);
-        this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
+        return String.fromCharCode(charCode);
       } catch {
         throw this._createError(
             _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -434,25 +436,21 @@ class _Tokenizer {
       const nameStart = this._cursor.clone();
       this._attemptCharCodeUntilFn(isNamedEntityEnd);
       if (this._cursor.peek() != chars.$SEMICOLON) {
-        // No semicolon was found so abort the encoded entity token that was in progress, and treat
-        // this as a text token
-        this._beginToken(textTokenType, start);
         this._cursor = nameStart;
-        this._endToken(['&']);
-      } else {
-        const name = this._cursor.getChars(nameStart);
-        this._cursor.advance();
-        const char = NAMED_ENTITIES[name];
-        if (!char) {
-          throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
-        }
-        this._endToken([char, `&${name};`]);
+        return '&';
       }
+      const name = this._cursor.getChars(nameStart);
+      this._cursor.advance();
+      const char = NAMED_ENTITIES[name];
+      if (!char) {
+        throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
+      }
+      return char;
     }
   }
 
-  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
-    this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
+  private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
+    this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
     const parts: string[] = [];
     while (true) {
       const tagCloseStart = this._cursor.clone();
@@ -461,16 +459,9 @@ class _Tokenizer {
       if (foundEndMarker) {
         break;
       }
-      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
-        parts.length = 0;
-        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
-        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
-      } else {
-        parts.push(this._readChar());
-      }
+      parts.push(this._readChar(decodeEntities));
     }
-    this._endToken([this._processCarriageReturns(parts.join(''))]);
+    return this._endToken([this._processCarriageReturns(parts.join(''))]);
   }
 
   private _consumeComment(start: CharacterCursor) {
@@ -572,8 +563,8 @@ class _Tokenizer {
     }
   }
 
-  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
-    this._consumeRawText(consumeEntities, () => {
+  private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
+    this._consumeRawText(decodeEntities, () => {
       if (!this._attemptCharCode(chars.$LT)) return false;
       if (!this._attemptCharCode(chars.$SLASH)) return false;
       this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -721,16 +712,11 @@ class _Tokenizer {
       const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
         this._endToken([this._processCarriageReturns(parts.join(''))], current);
-        parts.length = 0;
         this._consumeInterpolation(interpolationTokenType, current);
-        this._beginToken(textTokenType);
-      } else if (this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
         parts.length = 0;
-        this._consumeEntity(textTokenType);
         this._beginToken(textTokenType);
       } else {
-        parts.push(this._readChar());
+        parts.push(this._readChar(true));
       }
     }
 
@@ -909,9 +895,7 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
   let lastDstToken: Token|undefined = undefined;
   for (let i = 0; i < srcTokens.length; i++) {
     const token = srcTokens[i];
-    if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
-        (lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
-         token.type == TokenType.ATTR_VALUE_TEXT)) {
+    if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
       lastDstToken.parts[0]! += token.parts[0];
       lastDstToken.sourceSpan.end = token.sourceSpan.end;
     } else {
diff --git a/packages/compiler/src/ml_parser/parser.ts b/packages/compiler/src/ml_parser/parser.ts
index 4d5f18d440..9ac0b944e4 100644
--- a/packages/compiler/src/ml_parser/parser.ts
+++ b/packages/compiler/src/ml_parser/parser.ts
@@ -226,21 +226,20 @@ class _TreeBuilder {
       }
     }
 
-    // For now recombine text, interpolation and entity tokens
-    while (this._peek.type === lex.TokenType.INTERPOLATION ||
-           this._peek.type === lex.TokenType.TEXT ||
-           this._peek.type === lex.TokenType.ENCODED_ENTITY) {
-      token = this._advance();
-      if (token.type === lex.TokenType.INTERPOLATION) {
-        // For backward compatibility we decode HTML entities that appear in interpolation
-        // expressions. This is arguably a bug, but it could be a considerable breaking change to
-        // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
-        // chain after View Engine has been removed.
-        text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-      } else if (token.type === lex.TokenType.ENCODED_ENTITY) {
-        text += token.parts[0];
-      } else {
-        text += token.parts.join('');
+    // For now recombine text and interpolation tokens
+    if (this._peek.type === lex.TokenType.INTERPOLATION) {
+      while (this._peek.type === lex.TokenType.INTERPOLATION ||
+             this._peek.type === lex.TokenType.TEXT) {
+        token = this._advance();
+        if (token.type === lex.TokenType.INTERPOLATION) {
+          // For backward compatibility we decode HTML entities that appear in interpolation
+          // expressions. This is arguably a bug, but it could be a considerable breaking change to
+          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+          // chain after View Engine has been removed.
+          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else {
+          text += token.parts.join('');
+        }
       }
     }
 
@@ -370,17 +369,16 @@ class _TreeBuilder {
       this._advance();
     }
 
-    // Consume the attribute value
+    // Consume the value
     let value = '';
     let valueStartSpan: ParseSourceSpan|undefined = undefined;
     let valueEnd: ParseLocation|undefined = undefined;
     if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
       valueStartSpan = this._peek.sourceSpan;
       valueEnd = this._peek.sourceSpan.end;
-      // For now recombine text, interpolation and entity tokens
+      // For now we are recombining text and interpolation tokens
       while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
-             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
-             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
+             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
         let valueToken = this._advance();
         if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
@@ -388,8 +386,6 @@ class _TreeBuilder {
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
-          value += valueToken.parts[0];
         } else {
           value += valueToken.parts.join('');
         }
diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts
index bc8559221b..835d59970d 100644
--- a/packages/compiler/test/ml_parser/lexer_spec.ts
+++ b/packages/compiler/test/ml_parser/lexer_spec.ts
@@ -407,11 +407,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TAG_OPEN_START, '', 't'],
         [lex.TokenType.ATTR_NAME, '', 'a'],
         [lex.TokenType.ATTR_QUOTE, '"'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
+        [lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
         [lex.TokenType.ATTR_QUOTE, '"'],
         [lex.TokenType.TAG_OPEN_END],
         [lex.TokenType.EOF],
@@ -526,60 +522,50 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     describe('entities', () => {
       it('should parse named entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
           [lex.TokenType.EOF],
         ]);
       });
 
       it('should parse hexadecimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'AA'],
           [lex.TokenType.EOF],
         ]);
       });
 
      it('should parse decimal entities', () => {
        expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'A'],
          [lex.TokenType.EOF],
        ]);
      });
 
      it('should store the locations', () => {
        expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
          [lex.TokenType.EOF, ''],
        ]);
      });
 
      it('should report malformed/unknown entities', () => {
        expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unknown entity "tbo" - use the "&#<decimal>;" or "&#x<hex>;" syntax', '0:0'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#asdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unable to parse entity "&#as" - decimal character reference entities must end with ";"',
          '0:4'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
          '0:5'
        ]]);
 
        expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
-          [lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
+          [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
        ]);
      });
    });
@@ -657,16 +643,12 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
       it('should parse entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
           [lex.TokenType.EOF],
         ]);
 
         expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
           [lex.TokenType.EOF, ''],
         ]);
       });
@@ -912,9 +894,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
       expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
         [lex.TokenType.TAG_OPEN_START, '', 'title'],
         [lex.TokenType.TAG_OPEN_END],
-        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
+        [lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
        [lex.TokenType.TAG_CLOSE, '', 'title'],
        [lex.TokenType.EOF],
      ]);
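
Note on the restored behavior: with this revert, entity decoding happens inline
while the lexer consumes text (_readChar delegating to _decodeEntity), so input
like 'a&amp;b' again yields a single TEXT token with parts ['a&b'] instead of a
TEXT / ENCODED_ENTITY / TEXT sequence, as the updated specs above show. The
standalone TypeScript sketch below mirrors that restored control flow; it is
illustrative only (decodeEntitiesInText and the trimmed NAMED_ENTITIES map are
not part of the compiler), and it folds the lexer's error paths into a literal
'&' fallback where the real _decodeEntity would throw.

// Illustrative sketch -- mirrors the restored _decodeEntity()/_readChar()
// control flow over a plain string instead of a CharacterCursor.
const NAMED_ENTITIES: Record<string, string> = {amp: '&', lt: '<', gt: '>'};

function decodeEntitiesInText(input: string): string {
  let out = '';
  let i = 0;
  while (i < input.length) {
    if (input[i] !== '&') {
      out += input[i++];  // plain character: consumed directly, as in _readChar()
      continue;
    }
    const rest = input.slice(i + 1);
    const hex = /^#[xX]([0-9a-fA-F]+);/.exec(rest);  // "&#x41;" -> "A"
    const dec = /^#([0-9]+);/.exec(rest);            // "&#65;"  -> "A"
    const named = /^([a-zA-Z]+);/.exec(rest);        // "&amp;"  -> "&"
    if (hex) {
      out += String.fromCharCode(parseInt(hex[1], 16));
      i += 1 + hex[0].length;
    } else if (dec) {
      out += String.fromCharCode(parseInt(dec[1], 10));
      i += 1 + dec[0].length;
    } else if (named && NAMED_ENTITIES[named[1]] !== undefined) {
      out += NAMED_ENTITIES[named[1]];
      i += 1 + named[0].length;
    } else {
      // No terminating ";": the restored code rewinds the cursor and emits a
      // literal '&'. (For malformed numeric or unknown named references the
      // real lexer throws instead of falling back as done here.)
      out += '&';
      i++;
    }
  }
  return out;
}

// Matches the revert's expectations: one merged text run per input.
console.log(decodeEntitiesInText('a&amp;b'));      // "a&b"
console.log(decodeEntitiesInText('&#x41;&#65;'));  // "AA"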