refactor(compiler): support encoded entity tokens when lexing markup (#42062)

The lexer now splits encoded entity tokens out from text and attribute value tokens.

Previously, encoded entities were decoded and the decoded value was included as
part of the surrounding text token. Now the entities have their own tokens.
There are two scenarios: text content and attribute values.

Previously the contents of `<div>Hello &amp; goodbye</div>` were tokenized as a
single TEXT token. Now they become three tokens:

```
TEXT: "Hello "
ENCODED_ENTITY: "&", "&amp;"
TEXT: " goodbye"
```

Previously the attribute value in `<div title="Hello &amp; goodbye">` was
tokenized as a single ATTR_VALUE_TEXT token. Now it becomes three tokens:

```
ATTR_VALUE_TEXT: "Hello "
ENCODED_ENTITY: "&", "&amp;"
ATTR_VALUE_TEXT: " goodbye"
```

- ENCODED_ENTITY tokens have two parts: "decoded" and "encoded".
- ENCODED_ENTITY tokens are always preceded and followed by either TEXT tokens
  or ATTR_VALUE_TEXT tokens, depending upon the context, even if they represent
  an empty string.

The HTML parser has been modified to recombine these tokens, so that this
refactoring has a limited effect in this commit. Further refactorings that use
the new tokens will follow in subsequent commits.
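
The sketch below (not code from this commit) illustrates that recombination
step. It assumes a simplified `Token` shape reduced to the two fields this
change relies on (`type` and `parts`); the real lexer tokens also carry source
spans, and the tree builder additionally handles interpolation tokens.

```
// Simplified sketch of the recombination performed by the tree builder.
enum TokenType { TEXT, ATTR_VALUE_TEXT, ENCODED_ENTITY }

interface Token {
  type: TokenType;
  // TEXT / ATTR_VALUE_TEXT: [text]
  // ENCODED_ENTITY: [decoded, encoded], e.g. ['&', '&amp;']
  parts: string[];
}

function recombine(tokens: Token[]): string {
  let text = '';
  for (const token of tokens) {
    if (token.type === TokenType.ENCODED_ENTITY) {
      text += token.parts[0];  // use the decoded value
    } else {
      text += token.parts.join('');
    }
  }
  return text;
}

// The three tokens from the `<div>Hello &amp; goodbye</div>` example above:
recombine([
  {type: TokenType.TEXT, parts: ['Hello ']},
  {type: TokenType.ENCODED_ENTITY, parts: ['&', '&amp;']},
  {type: TokenType.TEXT, parts: [' goodbye']},
]);  // => 'Hello & goodbye'
```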

PR Close #42062
Pete Bacon Darwin 2021-05-14 18:53:17 +01:00 committed by atscott
parent c516e252fc
commit 942b24d5ea
3 changed files with 98 additions and 58 deletions


@@ -23,6 +23,7 @@ export enum TokenType {
ESCAPABLE_RAW_TEXT,
RAW_TEXT,
INTERPOLATION,
ENCODED_ENTITY,
COMMENT_START,
COMMENT_END,
CDATA_START,
@@ -395,19 +396,16 @@ class _Tokenizer {
}
}
private _readChar(decodeEntities: boolean): string {
if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
return this._decodeEntity();
} else {
// Don't rely upon reading directly from `_input` as the actual char value
// may have been generated from an escape sequence.
const char = String.fromCodePoint(this._cursor.peek());
this._cursor.advance();
return char;
}
private _readChar(): string {
// Don't rely upon reading directly from `_input` as the actual char value
// may have been generated from an escape sequence.
const char = String.fromCodePoint(this._cursor.peek());
this._cursor.advance();
return char;
}
private _decodeEntity(): string {
private _consumeEntity(textTokenType: TokenType): void {
this._beginToken(TokenType.ENCODED_ENTITY);
const start = this._cursor.clone();
this._cursor.advance();
if (this._attemptCharCode(chars.$HASH)) {
@@ -427,7 +425,7 @@ class _Tokenizer {
this._cursor.advance();
try {
const charCode = parseInt(strNum, isHex ? 16 : 10);
return String.fromCharCode(charCode);
this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
} catch {
throw this._createError(
_unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -436,21 +434,25 @@ class _Tokenizer {
const nameStart = this._cursor.clone();
this._attemptCharCodeUntilFn(isNamedEntityEnd);
if (this._cursor.peek() != chars.$SEMICOLON) {
// No semicolon was found so abort the encoded entity token that was in progress, and treat
// this as a text token
this._beginToken(textTokenType, start);
this._cursor = nameStart;
return '&';
this._endToken(['&']);
} else {
const name = this._cursor.getChars(nameStart);
this._cursor.advance();
const char = NAMED_ENTITIES[name];
if (!char) {
throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
}
this._endToken([char, `&${name};`]);
}
const name = this._cursor.getChars(nameStart);
this._cursor.advance();
const char = NAMED_ENTITIES[name];
if (!char) {
throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
}
return char;
}
}
private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
const parts: string[] = [];
while (true) {
const tagCloseStart = this._cursor.clone();
@@ -459,9 +461,16 @@ class _Tokenizer {
if (foundEndMarker) {
break;
}
parts.push(this._readChar(decodeEntities));
if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
this._endToken([this._processCarriageReturns(parts.join(''))]);
parts.length = 0;
this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
} else {
parts.push(this._readChar());
}
}
return this._endToken([this._processCarriageReturns(parts.join(''))]);
this._endToken([this._processCarriageReturns(parts.join(''))]);
}
private _consumeComment(start: CharacterCursor) {
@@ -563,8 +572,8 @@ class _Tokenizer {
}
}
private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
this._consumeRawText(decodeEntities, () => {
private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
this._consumeRawText(consumeEntities, () => {
if (!this._attemptCharCode(chars.$LT)) return false;
if (!this._attemptCharCode(chars.$SLASH)) return false;
this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -712,11 +721,16 @@ class _Tokenizer {
const current = this._cursor.clone();
if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
this._endToken([this._processCarriageReturns(parts.join(''))], current);
this._consumeInterpolation(interpolationTokenType, current);
parts.length = 0;
this._consumeInterpolation(interpolationTokenType, current);
this._beginToken(textTokenType);
} else if (this._cursor.peek() === chars.$AMPERSAND) {
this._endToken([this._processCarriageReturns(parts.join(''))]);
parts.length = 0;
this._consumeEntity(textTokenType);
this._beginToken(textTokenType);
} else {
parts.push(this._readChar(true));
parts.push(this._readChar());
}
}
@@ -895,7 +909,9 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
let lastDstToken: Token|undefined = undefined;
for (let i = 0; i < srcTokens.length; i++) {
const token = srcTokens[i];
if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
(lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
token.type == TokenType.ATTR_VALUE_TEXT)) {
lastDstToken.parts[0]! += token.parts[0];
lastDstToken.sourceSpan.end = token.sourceSpan.end;
} else {


@@ -226,20 +226,21 @@ class _TreeBuilder {
}
}
// For now recombine text and interpolation tokens
if (this._peek.type === lex.TokenType.INTERPOLATION) {
while (this._peek.type === lex.TokenType.INTERPOLATION ||
this._peek.type === lex.TokenType.TEXT) {
token = this._advance();
if (token.type === lex.TokenType.INTERPOLATION) {
// For backward compatibility we decode HTML entities that appear in interpolation
// expressions. This is arguably a bug, but it could be a considerable breaking change to
// fix it. It should be addressed in a larger project to refactor the entire parser/lexer
// chain after View Engine has been removed.
text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
} else {
text += token.parts.join('');
}
// For now recombine text, interpolation and entity tokens
while (this._peek.type === lex.TokenType.INTERPOLATION ||
this._peek.type === lex.TokenType.TEXT ||
this._peek.type === lex.TokenType.ENCODED_ENTITY) {
token = this._advance();
if (token.type === lex.TokenType.INTERPOLATION) {
// For backward compatibility we decode HTML entities that appear in interpolation
// expressions. This is arguably a bug, but it could be a considerable breaking change to
// fix it. It should be addressed in a larger project to refactor the entire parser/lexer
// chain after View Engine has been removed.
text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
} else if (token.type === lex.TokenType.ENCODED_ENTITY) {
text += token.parts[0];
} else {
text += token.parts.join('');
}
}
@@ -369,16 +370,17 @@ class _TreeBuilder {
this._advance();
}
// Consume the value
// Consume the attribute value
let value = '';
let valueStartSpan: ParseSourceSpan|undefined = undefined;
let valueEnd: ParseLocation|undefined = undefined;
if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
valueStartSpan = this._peek.sourceSpan;
valueEnd = this._peek.sourceSpan.end;
// For now we are recombining text and interpolation tokens
// For now recombine text, interpolation and entity tokens
while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
this._peek.type === lex.TokenType.ENCODED_ENTITY) {
let valueToken = this._advance();
if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
// For backward compatibility we decode HTML entities that appear in interpolation
@@ -386,6 +388,8 @@ class _TreeBuilder {
// fix it. It should be addressed in a larger project to refactor the entire parser/lexer
// chain after View Engine has been removed.
value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
} else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
value += valueToken.parts[0];
} else {
value += valueToken.parts.join('');
}


@@ -407,7 +407,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -522,50 +526,60 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
describe('entities', () => {
it('should parse named entities', () => {
expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
[lex.TokenType.TEXT, 'a&b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
[lex.TokenType.TEXT, 'b'],
[lex.TokenType.EOF],
]);
});
it('should parse hexadecimal entities', () => {
expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
[lex.TokenType.TEXT, 'AA'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.EOF],
]);
});
it('should parse decimal entities', () => {
expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
[lex.TokenType.TEXT, 'A'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.EOF],
]);
});
it('should store the locations', () => {
expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
[lex.TokenType.TEXT, 'a&amp;b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.ENCODED_ENTITY, '&amp;'],
[lex.TokenType.TEXT, 'b'],
[lex.TokenType.EOF, ''],
]);
});
it('should report malformed/unknown entities', () => {
expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
lex.TokenType.TEXT,
lex.TokenType.ENCODED_ENTITY,
'Unknown entity "tbo" - use the "&#<decimal>;" or "&#x<hex>;" syntax', '0:0'
]]);
expect(tokenizeAndHumanizeErrors('&#3sdf;')).toEqual([[
lex.TokenType.TEXT,
lex.TokenType.ENCODED_ENTITY,
'Unable to parse entity "&#3s" - decimal character reference entities must end with ";"',
'0:4'
]]);
expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
lex.TokenType.TEXT,
lex.TokenType.ENCODED_ENTITY,
'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
'0:5'
]]);
expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
[lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
[lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
]);
});
});
@@ -643,12 +657,16 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
it('should parse entities', () => {
expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
[lex.TokenType.TEXT, 'a&b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
[lex.TokenType.TEXT, 'b'],
[lex.TokenType.EOF],
]);
expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
[lex.TokenType.TEXT, 'a&amp;b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.ENCODED_ENTITY, '&amp;'],
[lex.TokenType.TEXT, 'b'],
[lex.TokenType.EOF, ''],
]);
});
@@ -894,7 +912,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
[lex.TokenType.TAG_OPEN_START, '', 'title'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
[lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
[lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
[lex.TokenType.TAG_CLOSE, '', 'title'],
[lex.TokenType.EOF],
]);