From 9de65dbdceac3077881fbc49717f33d0f379e21d Mon Sep 17 00:00:00 2001 From: Pete Bacon Darwin Date: Mon, 21 Jun 2021 21:20:44 +0100 Subject: [PATCH] fix(compiler): should not break a text token on a non-valid start tag (#42605) Previously the lexer would break out of consuming a text token if it contains a `<` character. Then if the next characters did not indicate an HTML syntax item, such as a tag or comment, then it would start a new text token. These consecutive text tokens are then merged into each other in a post tokenization step. In the commit before this, interpolation no longer leaks across text tokens. The approach given above to handling `<` characters that appear in text is no longer adequate. This change ensures that the lexer only breaks out of a text token if the next characters indicate a valid HTML tag, comment, CDATA etc. PR Close #42605 --- packages/compiler/src/ml_parser/lexer.ts | 21 +++++++++- .../compiler/test/ml_parser/lexer_spec.ts | 38 ++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts index 75da6732e5..8d832f4d2a 100644 --- a/packages/compiler/src/ml_parser/lexer.ts +++ b/packages/compiler/src/ml_parser/lexer.ts @@ -721,7 +721,7 @@ class _Tokenizer { } private _isTextEnd(): boolean { - if (this._cursor.peek() === chars.$LT || this._cursor.peek() === chars.$EOF) { + if (this._isTagStart() || this._cursor.peek() === chars.$EOF) { return true; } @@ -740,6 +740,25 @@ class _Tokenizer { return false; } + /** + * Returns true if the current cursor is pointing to the start of a tag + * (opening/closing/comments/cdata/etc). + */ + private _isTagStart(): boolean { + if (this._cursor.peek() === chars.$LT) { + // We assume that `<` followed by whitespace is not the start of an HTML element. + const tmp = this._cursor.clone(); + tmp.advance(); + // If the next character is alphabetic, ! 
or / then it is a tag start + const code = tmp.peek(); + if ((chars.$a <= code && code <= chars.$z) || (chars.$A <= code && code <= chars.$Z) || + code === chars.$SLASH || code === chars.$BANG) { + return true; + } + } + return false; + } + private _readUntil(char: number): string { const start = this._cursor.clone(); this._attemptUntilChar(char); diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts index 00c19418ee..5c795ed959 100644 --- a/packages/compiler/test/ml_parser/lexer_spec.ts +++ b/packages/compiler/test/ml_parser/lexer_spec.ts @@ -612,7 +612,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u ]); }); - it('should parse valid start tag in interpolation', () => { + it('should break out of interpolation in text token on valid start tag', () => { expect(tokenizeAndHumanizeParts('{{ a <b && c > d }}')).toEqual([ [lex.TokenType.TEXT, '{{ a '], [lex.TokenType.TAG_OPEN_START, '', 'b'], @@ -624,6 +624,42 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u ]); }); + it('should break out of interpolation in text token on valid comment', () => { + expect(tokenizeAndHumanizeParts('{{ a }<!---->}')).toEqual([ + [lex.TokenType.TEXT, '{{ a }'], + [lex.TokenType.COMMENT_START], + [lex.TokenType.RAW_TEXT, ''], + [lex.TokenType.COMMENT_END], + [lex.TokenType.TEXT, '}'], + [lex.TokenType.EOF], + ]); + }); + + it('should break out of interpolation in text token on valid CDATA', () => { + expect(tokenizeAndHumanizeParts('{{ a }<![CDATA[]]>}')).toEqual([ + [lex.TokenType.TEXT, '{{ a }'], + [lex.TokenType.CDATA_START], + [lex.TokenType.RAW_TEXT, ''], + [lex.TokenType.CDATA_END], + [lex.TokenType.TEXT, '}'], + [lex.TokenType.EOF], + ]); + }); + + it('should ignore invalid start tag in interpolation', () => { + // Note that if the `<=` is considered an "end of text" then the following `{` would + // incorrectly be considered part of an ICU. 
+ expect(tokenizeAndHumanizeParts(`<code>{{'<={'}}</code>`, {tokenizeExpansionForms: true})) + .toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 'code'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TEXT, '{{\'<={\'}}'], + [lex.TokenType.TAG_CLOSE, '', 'code'], + [lex.TokenType.EOF], + ]); + }); + + it('should parse start tags quotes in place of an attribute name as text', () => { expect(tokenizeAndHumanizeParts('<t ">')).toEqual([ [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],