From 942b24d5ea5d36ad4e53ed435bda35a6ae6876c9 Mon Sep 17 00:00:00 2001
From: Pete Bacon Darwin
Date: Fri, 14 May 2021 18:53:17 +0100
Subject: [PATCH] refactor(compiler): support encoded entity tokens when lexing
 markup (#42062)

The lexer now splits encoded entity tokens out from text and attribute
value tokens.

Previously, encoded entities were decoded and the decoded value was
included as part of the surrounding text token. Now the entities have
their own tokens. There are two scenarios: text and attribute values.

Previously the contents of `<div>Hello &amp; goodbye</div>` would be a
single TEXT token. Now it will be three tokens:

```
TEXT: "Hello "
ENCODED_ENTITY: "&", "&amp;"
TEXT: " goodbye"
```

Previously the attribute value in `<div title="Hello &amp; goodbye">`
would be a single text token. Now it will be three tokens:

```
ATTR_VALUE_TEXT: "Hello "
ENCODED_ENTITY: "&", "&amp;"
ATTR_VALUE_TEXT: " goodbye"
```

- ENCODED_ENTITY tokens have two parts: "decoded" and "encoded".
- ENCODED_ENTITY tokens are always preceded and followed by either TEXT
  tokens or ATTR_VALUE_TEXT tokens, depending upon the context, even if
  they represent an empty string.

The HTML parser has been modified to recombine these tokens, so that this
refactoring has a limited effect in this commit. Further refactorings to
use these new tokens will follow in subsequent commits.
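As an illustration of how a consumer can rebuild the original text from the
new token stream, here is a minimal TypeScript sketch; `SimpleToken` is a
hypothetical, simplified stand-in for the lexer's real `Token` interface, and
the helper mirrors the recombination that the HTML parser performs by taking
the "decoded" part of each ENCODED_ENTITY token:

```ts
// Illustrative sketch only; `SimpleToken` is a simplified stand-in for the
// real `Token` interface in packages/compiler/src/ml_parser/lexer.ts.
interface SimpleToken {
  type: 'TEXT'|'ATTR_VALUE_TEXT'|'ENCODED_ENTITY';
  // Text tokens have one part (the text); ENCODED_ENTITY tokens have two
  // parts: [decoded, encoded], e.g. ['&', '&amp;'].
  parts: string[];
}

// Rebuild plain text from a run of tokens by using the decoded part of each
// encoded entity, as the tree builder does when it recombines tokens.
function recombine(tokens: SimpleToken[]): string {
  return tokens
      .map(t => t.type === 'ENCODED_ENTITY' ? t.parts[0] : t.parts.join(''))
      .join('');
}

// The contents of `<div>Hello &amp; goodbye</div>` now lex to three tokens:
const tokens: SimpleToken[] = [
  {type: 'TEXT', parts: ['Hello ']},
  {type: 'ENCODED_ENTITY', parts: ['&', '&amp;']},
  {type: 'TEXT', parts: [' goodbye']},
];

console.log(recombine(tokens));  // "Hello & goodbye"
```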
PR Close #42062
---
 packages/compiler/src/ml_parser/lexer.ts  | 74 +++++++++++--------
 packages/compiler/src/ml_parser/parser.ts | 38 +++++-----
 .../compiler/test/ml_parser/lexer_spec.ts | 44 ++++++++---
 3 files changed, 98 insertions(+), 58 deletions(-)

diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts
index d7306a2389..13d3a6bfba 100644
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -23,6 +23,7 @@ export enum TokenType {
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
   INTERPOLATION,
+  ENCODED_ENTITY,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -395,19 +396,16 @@ class _Tokenizer {
     }
   }
 
-  private _readChar(decodeEntities: boolean): string {
-    if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
-      return this._decodeEntity();
-    } else {
-      // Don't rely upon reading directly from `_input` as the actual char value
-      // may have been generated from an escape sequence.
-      const char = String.fromCodePoint(this._cursor.peek());
-      this._cursor.advance();
-      return char;
-    }
+  private _readChar(): string {
+    // Don't rely upon reading directly from `_input` as the actual char value
+    // may have been generated from an escape sequence.
+    const char = String.fromCodePoint(this._cursor.peek());
+    this._cursor.advance();
+    return char;
   }
 
-  private _decodeEntity(): string {
+  private _consumeEntity(textTokenType: TokenType): void {
+    this._beginToken(TokenType.ENCODED_ENTITY);
     const start = this._cursor.clone();
     this._cursor.advance();
     if (this._attemptCharCode(chars.$HASH)) {
@@ -427,7 +425,7 @@
       this._cursor.advance();
       try {
         const charCode = parseInt(strNum, isHex ? 16 : 10);
-        return String.fromCharCode(charCode);
+        this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
       } catch {
         throw this._createError(
             _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -436,21 +434,25 @@
       const nameStart = this._cursor.clone();
       this._attemptCharCodeUntilFn(isNamedEntityEnd);
       if (this._cursor.peek() != chars.$SEMICOLON) {
+        // No semicolon was found so abort the encoded entity token that was in progress, and treat
+        // this as a text token
+        this._beginToken(textTokenType, start);
         this._cursor = nameStart;
-        return '&';
+        this._endToken(['&']);
+      } else {
+        const name = this._cursor.getChars(nameStart);
+        this._cursor.advance();
+        const char = NAMED_ENTITIES[name];
+        if (!char) {
+          throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
+        }
+        this._endToken([char, `&${name};`]);
       }
-      const name = this._cursor.getChars(nameStart);
-      this._cursor.advance();
-      const char = NAMED_ENTITIES[name];
-      if (!char) {
-        throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
-      }
-      return char;
     }
   }
 
-  private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
-    this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
+  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
+    this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
     const parts: string[] = [];
     while (true) {
       const tagCloseStart = this._cursor.clone();
@@ -459,9 +461,16 @@
       if (foundEndMarker) {
         break;
       }
-      parts.push(this._readChar(decodeEntities));
+      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
+        this._endToken([this._processCarriageReturns(parts.join(''))]);
+        parts.length = 0;
+        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
+        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
+      } else {
+        parts.push(this._readChar());
+      }
     }
-    return this._endToken([this._processCarriageReturns(parts.join(''))]);
+    this._endToken([this._processCarriageReturns(parts.join(''))]);
   }
 
   private _consumeComment(start: CharacterCursor) {
@@ -563,8 +572,8 @@
     }
   }
 
-  private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
-    this._consumeRawText(decodeEntities, () => {
+  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
+    this._consumeRawText(consumeEntities, () => {
       if (!this._attemptCharCode(chars.$LT)) return false;
       if (!this._attemptCharCode(chars.$SLASH)) return false;
       this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -712,11 +721,16 @@
       const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
         this._endToken([this._processCarriageReturns(parts.join(''))], current);
-        this._consumeInterpolation(interpolationTokenType, current);
         parts.length = 0;
+        this._consumeInterpolation(interpolationTokenType, current);
+        this._beginToken(textTokenType);
+      } else if (this._cursor.peek() === chars.$AMPERSAND) {
+        this._endToken([this._processCarriageReturns(parts.join(''))]);
+        parts.length = 0;
+        this._consumeEntity(textTokenType);
         this._beginToken(textTokenType);
       } else {
-        parts.push(this._readChar(true));
+        parts.push(this._readChar());
       }
     }
 
@@ -895,7 +909,9 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
   let lastDstToken: Token|undefined = undefined;
   for (let i = 0; i < srcTokens.length; i++) {
     const token = srcTokens[i];
-    if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
+    if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
+        (lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
+         token.type == TokenType.ATTR_VALUE_TEXT)) {
       lastDstToken.parts[0]! += token.parts[0];
       lastDstToken.sourceSpan.end = token.sourceSpan.end;
     } else {
diff --git a/packages/compiler/src/ml_parser/parser.ts b/packages/compiler/src/ml_parser/parser.ts
index 9ac0b944e4..4d5f18d440 100644
--- a/packages/compiler/src/ml_parser/parser.ts
+++ b/packages/compiler/src/ml_parser/parser.ts
@@ -226,20 +226,21 @@ class _TreeBuilder {
       }
     }
 
-    // For now recombine text and interpolation tokens
-    if (this._peek.type === lex.TokenType.INTERPOLATION) {
-      while (this._peek.type === lex.TokenType.INTERPOLATION ||
-             this._peek.type === lex.TokenType.TEXT) {
-        token = this._advance();
-        if (token.type === lex.TokenType.INTERPOLATION) {
-          // For backward compatibility we decode HTML entities that appear in interpolation
-          // expressions. This is arguably a bug, but it could be a considerable breaking change to
-          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
-          // chain after View Engine has been removed.
-          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else {
-          text += token.parts.join('');
-        }
+    // For now recombine text, interpolation and entity tokens
+    while (this._peek.type === lex.TokenType.INTERPOLATION ||
+           this._peek.type === lex.TokenType.TEXT ||
+           this._peek.type === lex.TokenType.ENCODED_ENTITY) {
+      token = this._advance();
+      if (token.type === lex.TokenType.INTERPOLATION) {
+        // For backward compatibility we decode HTML entities that appear in interpolation
+        // expressions. This is arguably a bug, but it could be a considerable breaking change to
+        // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+        // chain after View Engine has been removed.
+        text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+      } else if (token.type === lex.TokenType.ENCODED_ENTITY) {
+        text += token.parts[0];
+      } else {
+        text += token.parts.join('');
       }
     }
 
@@ -369,16 +370,17 @@ class _TreeBuilder {
       this._advance();
     }
 
-    // Consume the value
+    // Consume the attribute value
     let value = '';
     let valueStartSpan: ParseSourceSpan|undefined = undefined;
     let valueEnd: ParseLocation|undefined = undefined;
     if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
       valueStartSpan = this._peek.sourceSpan;
       valueEnd = this._peek.sourceSpan.end;
-      // For now we are recombining text and interpolation tokens
+      // For now recombine text, interpolation and entity tokens
       while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
-             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
+             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
+             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
         let valueToken = this._advance();
         if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
@@ -386,6 +388,8 @@ class _TreeBuilder {
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
+          value += valueToken.parts[0];
         } else {
           value += valueToken.parts.join('');
         }
diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts
index 835d59970d..bc8559221b 100644
--- a/packages/compiler/test/ml_parser/lexer_spec.ts
+++ b/packages/compiler/test/ml_parser/lexer_spec.ts
@@ -407,7 +407,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
           [lex.TokenType.TAG_OPEN_START, '', 't'],
           [lex.TokenType.ATTR_NAME, '', 'a'],
           [lex.TokenType.ATTR_QUOTE, '"'],
-          [lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
+          [lex.TokenType.ATTR_VALUE_TEXT, ''],
+          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
+          [lex.TokenType.ATTR_VALUE_TEXT, ''],
+          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
+          [lex.TokenType.ATTR_VALUE_TEXT, ''],
           [lex.TokenType.ATTR_QUOTE, '"'],
           [lex.TokenType.TAG_OPEN_END],
           [lex.TokenType.EOF],
@@ -522,50 +526,60 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     describe('entities', () => {
       it('should parse named entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a&b'],
+          [lex.TokenType.TEXT, 'a'],
+          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+          [lex.TokenType.TEXT, 'b'],
           [lex.TokenType.EOF],
         ]);
       });
 
       it('should parse hexadecimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
-          [lex.TokenType.TEXT, 'AA'],
+          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
+          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
+          [lex.TokenType.TEXT, ''],
           [lex.TokenType.EOF],
         ]);
       });
 
       it('should parse decimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
-          [lex.TokenType.TEXT, 'A'],
+          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
+          [lex.TokenType.TEXT, ''],
           [lex.TokenType.EOF],
         ]);
       });
 
       it('should store the locations', () => {
         expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a&amp;b'],
+          [lex.TokenType.TEXT, 'a'],
+          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
+          [lex.TokenType.TEXT, 'b'],
           [lex.TokenType.EOF, ''],
         ]);
       });
 
       it('should report malformed/unknown entities', () => {
         expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
-          lex.TokenType.TEXT,
+          lex.TokenType.ENCODED_ENTITY,
           'Unknown entity "tbo" - use the "&#<decimal>;" or  "&#x<hex>;" syntax', '0:0'
         ]]);
         expect(tokenizeAndHumanizeErrors('&#3sdf;')).toEqual([[
-          lex.TokenType.TEXT,
+          lex.TokenType.ENCODED_ENTITY,
           'Unable to parse entity "&#3s" - decimal character reference entities must end with ";"',
           '0:4'
         ]]);
         expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
-          lex.TokenType.TEXT,
+          lex.TokenType.ENCODED_ENTITY,
           'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
           '0:5'
         ]]);
 
         expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
-          [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
+          [lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
         ]);
       });
     });
@@ -643,12 +657,16 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
       it('should parse entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a&b'],
+          [lex.TokenType.TEXT, 'a'],
+          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+          [lex.TokenType.TEXT, 'b'],
           [lex.TokenType.EOF],
         ]);
 
         expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a&amp;b'],
+          [lex.TokenType.TEXT, 'a'],
+          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
+          [lex.TokenType.TEXT, 'b'],
           [lex.TokenType.EOF, ''],
         ]);
       });
@@ -894,7 +912,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
       expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
         [lex.TokenType.TAG_OPEN_START, '', 'title'],
         [lex.TokenType.TAG_OPEN_END],
-        [lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
+        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
        [lex.TokenType.TAG_CLOSE, '', 'title'],
         [lex.TokenType.EOF],
       ]);