From 9de65dbdceac3077881fbc49717f33d0f379e21d Mon Sep 17 00:00:00 2001
From: Pete Bacon Darwin <pete@bacondarwin.com>
Date: Mon, 21 Jun 2021 21:20:44 +0100
Subject: [PATCH] fix(compiler): should not break a text token on a non-valid
 start tag (#42605)

Previously the lexer would break out of consuming a text token whenever it
encountered a `<` character. If the next characters did not indicate an HTML
syntax item, such as a tag or comment, it would then start a new text token.
These consecutive text tokens were then merged into each other in a
post-tokenization step.

As of the commit before this one, interpolation no longer leaks across text
tokens, so the approach described above for handling `<` characters that
appear in text is no longer adequate. This change ensures that the lexer only
breaks out of a text token if the next characters indicate a valid HTML tag,
comment, CDATA, etc.

PR Close #42605
---
 packages/compiler/src/ml_parser/lexer.ts      | 21 +++++++++-
 .../compiler/test/ml_parser/lexer_spec.ts     | 38 ++++++++++++++++++-
 2 files changed, 57 insertions(+), 2 deletions(-)
diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts
index 75da6732e5..8d832f4d2a 100644
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -721,7 +721,7 @@ class _Tokenizer {
   }
 
   private _isTextEnd(): boolean {
-    if (this._cursor.peek() === chars.$LT || this._cursor.peek() === chars.$EOF) {
+    if (this._isTagStart() || this._cursor.peek() === chars.$EOF) {
       return true;
     }
 
@@ -740,6 +740,25 @@ class _Tokenizer {
     return false;
   }
 
+  /**
+   * Returns true if the current cursor is pointing to the start of a tag
+   * (opening/closing/comments/cdata/etc).
+   */
+  private _isTagStart(): boolean {
+    if (this._cursor.peek() === chars.$LT) {
+      // We assume that `<` followed by whitespace is not the start of an HTML element.
+      const tmp = this._cursor.clone();
+      tmp.advance();
+      // If the next character is alphabetic, `!` or `/`, then it is a tag start
+      const code = tmp.peek();
+      if ((chars.$a <= code && code <= chars.$z) || (chars.$A <= code && code <= chars.$Z) ||
+          code === chars.$SLASH || code === chars.$BANG) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   private _readUntil(char: number): string {
     const start = this._cursor.clone();
     this._attemptUntilChar(char);
diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts
index 00c19418ee..5c795ed959 100644
--- a/packages/compiler/test/ml_parser/lexer_spec.ts
+++ b/packages/compiler/test/ml_parser/lexer_spec.ts
@@ -612,7 +612,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         ]);
       });
 
-      it('should parse valid start tag in interpolation', () => {
+      it('should break out of interpolation in text token on valid start tag', () => {
         expect(tokenizeAndHumanizeParts('{{ a <b && c > d }}')).toEqual([
           [lex.TokenType.TEXT, '{{ a '],
           [lex.TokenType.TAG_OPEN_START, '', 'b'],
@@ -624,6 +624,42 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         ]);
       });
 
+      it('should break out of interpolation in text token on valid comment', () => {
+        expect(tokenizeAndHumanizeParts('{{ a }<!---->}')).toEqual([
+          [lex.TokenType.TEXT, '{{ a }'],
+          [lex.TokenType.COMMENT_START],
+          [lex.TokenType.RAW_TEXT, ''],
+          [lex.TokenType.COMMENT_END],
+          [lex.TokenType.TEXT, '}'],
+          [lex.TokenType.EOF],
+        ]);
+      });
+
+      it('should break out of interpolation in text token on valid CDATA', () => {
+        expect(tokenizeAndHumanizeParts('{{ a }<![CDATA[]]>}')).toEqual([
+          [lex.TokenType.TEXT, '{{ a }'],
+          [lex.TokenType.CDATA_START],
+          [lex.TokenType.RAW_TEXT, ''],
+          [lex.TokenType.CDATA_END],
+          [lex.TokenType.TEXT, '}'],
+          [lex.TokenType.EOF],
+        ]);
+      });
+
+      it('should ignore invalid start tag in interpolation', () => {
+        // Note that if the `<=` is considered an "end of text" then the following `{` would
+        // incorrectly be considered part of an ICU.
+        expect(tokenizeAndHumanizeParts(`<code>{{'<={'}}</code>`, {tokenizeExpansionForms: true}))
+            .toEqual([
+              [lex.TokenType.TAG_OPEN_START, '', 'code'],
+              [lex.TokenType.TAG_OPEN_END],
+              [lex.TokenType.TEXT, '{{\'<={\'}}'],
+              [lex.TokenType.TAG_CLOSE, '', 'code'],
+              [lex.TokenType.EOF],
+            ]);
+      });
+
+
       it('should parse start tags quotes in place of an attribute name as text', () => {
         expect(tokenizeAndHumanizeParts('<t ">')).toEqual([
           [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],