From c516e252fcb199f2f503db5dc6ce563a299da010 Mon Sep 17 00:00:00 2001 From: Pete Bacon Darwin Date: Thu, 13 May 2021 17:00:56 +0100 Subject: [PATCH] refactor(compiler): support interpolation tokens when lexing attribute values (#42062) The lexer now splits interpolation tokens out from attribute value tokens. Previously the attribute value of `<div attr="Hello, {{ name}}">
` would be a single token. Now it will be three tokens: ``` ATTR_VALUE_TEXT: "Hello, " ATTR_VALUE_INTERPOLATION: "{{", " name", "}}" ATTR_VALUE_TEXT: "" ``` - ATTR_VALUE_INTERPOLATION tokens have three parts, "start marker", "expression" and "end marker". - ATTR_VALUE_INTERPOLATION tokens are always preceded and followed by TEXT tokens, even if they represent an empty string. The HTML parser has been modified to recombine these tokens to allow this refactoring to have limited effect in this commit. Further refactorings to use these new tokens will follow in subsequent commits. PR Close #42062 --- packages/compiler/src/ml_parser/lexer.ts | 76 +++++++++++-------- packages/compiler/src/ml_parser/parser.ts | 48 ++++++++---- .../test/ml_parser/html_parser_spec.ts | 13 ++++ .../compiler/test/ml_parser/lexer_spec.ts | 74 ++++++++++++------ 4 files changed, 143 insertions(+), 68 deletions(-) diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts index d62a54f576..d7306a2389 100644 --- a/packages/compiler/src/ml_parser/lexer.ts +++ b/packages/compiler/src/ml_parser/lexer.ts @@ -29,7 +29,8 @@ export enum TokenType { CDATA_END, ATTR_NAME, ATTR_QUOTE, - ATTR_VALUE, + ATTR_VALUE_TEXT, + ATTR_VALUE_INTERPOLATION, DOC_TYPE, EXPANSION_FORM_START, EXPANSION_CASE_VALUE, @@ -228,7 +229,8 @@ class _Tokenizer { this._consumeTagOpen(start); } } else if (!(this._tokenizeIcu && this._tokenizeExpansionForm())) { - this._consumeText(); + this._consumeWithInterpolation( + TokenType.TEXT, TokenType.INTERPOLATION, () => this._isTextEnd()); } } catch (e) { this.handleError(e); @@ -595,29 +597,25 @@ class _Tokenizer { private _consumeAttributeValue() { let value: string; if (this._cursor.peek() === chars.$SQ || this._cursor.peek() === chars.$DQ) { - this._beginToken(TokenType.ATTR_QUOTE); const quoteChar = this._cursor.peek(); - this._cursor.advance(); - this._endToken([String.fromCodePoint(quoteChar)]); - this._beginToken(TokenType.ATTR_VALUE); - const 
parts: string[] = []; - while (this._cursor.peek() !== quoteChar) { - parts.push(this._readChar(true)); - } - value = parts.join(''); - this._endToken([this._processCarriageReturns(value)]); - this._beginToken(TokenType.ATTR_QUOTE); - this._cursor.advance(); - this._endToken([String.fromCodePoint(quoteChar)]); + this._consumeQuote(quoteChar); + this._consumeWithInterpolation( + TokenType.ATTR_VALUE_TEXT, TokenType.ATTR_VALUE_INTERPOLATION, + () => this._cursor.peek() === quoteChar); + this._consumeQuote(quoteChar); } else { - this._beginToken(TokenType.ATTR_VALUE); - const valueStart = this._cursor.clone(); - this._requireCharCodeUntilFn(isNameEnd, 1); - value = this._cursor.getChars(valueStart); - this._endToken([this._processCarriageReturns(value)]); + const endPredicate = () => isNameEnd(this._cursor.peek()); + this._consumeWithInterpolation( + TokenType.ATTR_VALUE_TEXT, TokenType.ATTR_VALUE_INTERPOLATION, endPredicate); } } + private _consumeQuote(quoteChar: number) { + this._beginToken(TokenType.ATTR_QUOTE); + this._requireCharCode(quoteChar); + this._endToken([String.fromCodePoint(quoteChar)]); + } + private _consumeTagOpenEnd() { const tokenType = this._attemptCharCode(chars.$SLASH) ? TokenType.TAG_OPEN_END_VOID : TokenType.TAG_OPEN_END; @@ -696,21 +694,31 @@ class _Tokenizer { this._expansionCaseStack.pop(); } - private _consumeText() { - this._beginToken(TokenType.TEXT); + /** + * Consume a string that may contain interpolation expressions. + * The first token consumed will be of `tokenType` and then there will be alternating + * `interpolationTokenType` and `tokenType` tokens until the `endPredicate()` returns true. + * + * @param textTokenType the kind of tokens to interleave around interpolation tokens. + * @param interpolationTokenType the kind of tokens that contain interpolation. + * @param endPredicate a function that should return true when we should stop consuming. 
+ */ + private _consumeWithInterpolation( + textTokenType: TokenType, interpolationTokenType: TokenType, endPredicate: () => boolean) { + this._beginToken(textTokenType); const parts: string[] = []; - do { + while (!endPredicate()) { const current = this._cursor.clone(); if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) { this._endToken([this._processCarriageReturns(parts.join(''))], current); - this._consumeInterpolation(current); + this._consumeInterpolation(interpolationTokenType, current); parts.length = 0; - this._beginToken(TokenType.TEXT); + this._beginToken(textTokenType); } else { parts.push(this._readChar(true)); } - } while (!this._isTextEnd()); + } // It is possible that an interpolation was started but not ended inside this text token. // Make sure that we reset the state of the lexer correctly. @@ -719,14 +727,15 @@ class _Tokenizer { this._endToken([this._processCarriageReturns(parts.join(''))]); } - private _consumeInterpolation(interpolationStart: CharacterCursor) { + private _consumeInterpolation( + interpolationTokenType: TokenType, interpolationStart: CharacterCursor) { const parts: string[] = []; - this._beginToken(TokenType.INTERPOLATION, interpolationStart); + this._beginToken(interpolationTokenType, interpolationStart); parts.push(this._interpolationConfig.start); // Find the end of the interpolation, ignoring content inside quotes. const expressionStart = this._cursor.clone(); - let inQuote: string|null = null; + let inQuote: number|null = null; let inComment = false; while (this._cursor.peek() !== chars.$EOF) { const current = this._cursor.clone(); @@ -752,14 +761,15 @@ class _Tokenizer { } } - const char = this._readChar(true); - if (char === '\\') { + const char = this._cursor.peek(); + this._cursor.advance(); + if (char === chars.$BACKSLASH) { // Skip the next character because it was escaped. 
- this._readChar(true); + this._cursor.advance(); } else if (char === inQuote) { // Exiting the current quoted string inQuote = null; - } else if (!inComment && /['"`]/.test(char)) { + } else if (!inComment && chars.isQuote(char)) { // Entering a new quoted string inQuote = char; } diff --git a/packages/compiler/src/ml_parser/parser.ts b/packages/compiler/src/ml_parser/parser.ts index fd01357d43..9ac0b944e4 100644 --- a/packages/compiler/src/ml_parser/parser.ts +++ b/packages/compiler/src/ml_parser/parser.ts @@ -6,7 +6,7 @@ * found in the LICENSE file at https://angular.io/license */ -import {ParseError, ParseSourceSpan} from '../parse_util'; +import {ParseError, ParseLocation, ParseSourceSpan} from '../parse_util'; import * as html from './ast'; import {NAMED_ENTITIES} from './entities'; @@ -362,27 +362,49 @@ class _TreeBuilder { private _consumeAttr(attrName: lex.Token): html.Attribute { const fullName = mergeNsAndName(attrName.parts[0], attrName.parts[1]); - let end = attrName.sourceSpan.end; - let value = ''; - let valueSpan: ParseSourceSpan = undefined!; + let attrEnd = attrName.sourceSpan.end; + + // Consume any quote if (this._peek.type === lex.TokenType.ATTR_QUOTE) { this._advance(); } - if (this._peek.type === lex.TokenType.ATTR_VALUE) { - const valueToken = this._advance(); - value = valueToken.parts[0]; - end = valueToken.sourceSpan.end; - valueSpan = valueToken.sourceSpan; + + // Consume the value + let value = ''; + let valueStartSpan: ParseSourceSpan|undefined = undefined; + let valueEnd: ParseLocation|undefined = undefined; + if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) { + valueStartSpan = this._peek.sourceSpan; + valueEnd = this._peek.sourceSpan.end; + // For now we are recombining text and interpolation tokens + while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT || + this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) { + let valueToken = this._advance(); + if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) { + 
// For backward compatibility we decode HTML entities that appear in interpolation + // expressions. This is arguably a bug, but it could be a considerable breaking change to + // fix it. It should be addressed in a larger project to refactor the entire parser/lexer + // chain after View Engine has been removed. + value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity); + } else { + value += valueToken.parts.join(''); + } + valueEnd = attrEnd = valueToken.sourceSpan.end; + } } + + // Consume any quote if (this._peek.type === lex.TokenType.ATTR_QUOTE) { const quoteToken = this._advance(); - end = quoteToken.sourceSpan.end; + attrEnd = quoteToken.sourceSpan.end; } - const keySpan = new ParseSourceSpan(attrName.sourceSpan.start, attrName.sourceSpan.end); + + const valueSpan = valueStartSpan && valueEnd && + new ParseSourceSpan(valueStartSpan.start, valueEnd, valueStartSpan.fullStart); return new html.Attribute( fullName, value, - new ParseSourceSpan(attrName.sourceSpan.start, end, attrName.sourceSpan.fullStart), keySpan, - valueSpan); + new ParseSourceSpan(attrName.sourceSpan.start, attrEnd, attrName.sourceSpan.fullStart), + attrName.sourceSpan, valueSpan); } private _getParentElement(): html.Element|null { diff --git a/packages/compiler/test/ml_parser/html_parser_spec.ts b/packages/compiler/test/ml_parser/html_parser_spec.ts index 279bca60d3..4fbfa75692 100644 --- a/packages/compiler/test/ml_parser/html_parser_spec.ts +++ b/packages/compiler/test/ml_parser/html_parser_spec.ts @@ -250,6 +250,19 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn, humanizeNodes} ]); }); + it('should decode HTML entities in interpolated attributes', () => { + // Note that the detail of decoding corner-cases is tested in the + // "should decode HTML entities in interpolations" spec. + expect(humanizeDomSourceSpans(parser.parse('
', 'TestComp'))) + .toEqual([ + [ + html.Element, 'div', 0, '
', '
', + '
' + ], + [html.Attribute, 'foo', '{{&}}', 'foo="{{&}}"'] + ]); + }); + it('should normalize line endings within attribute values', () => { const result = parser.parse('
', 'TestComp'); diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts index 54005b28ba..835d59970d 100644 --- a/packages/compiler/test/ml_parser/lexer_spec.ts +++ b/packages/compiler/test/ml_parser/lexer_spec.ts @@ -257,7 +257,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.INCOMPLETE_TAG_OPEN, '')).toEqual([ [lex.TokenType.TAG_OPEN_START, '', 't'], [lex.TokenType.ATTR_NAME, '', 'a'], - [lex.TokenType.ATTR_VALUE, 'b'], + [lex.TokenType.ATTR_VALUE_TEXT, 'b'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.EOF], + ]); + }); + + it('should parse attributes with unquoted interpolation value', () => { + expect(tokenizeAndHumanizeParts('')).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 'a'], + [lex.TokenType.ATTR_NAME, '', 'a'], + [lex.TokenType.ATTR_VALUE_TEXT, ''], + [lex.TokenType.ATTR_VALUE_INTERPOLATION, '{{', 'link.text', '}}'], + [lex.TokenType.ATTR_VALUE_TEXT, ''], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.EOF], + ]); + }); + + it('should parse attributes with empty quoted value', () => { + expect(tokenizeAndHumanizeParts('')).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.ATTR_NAME, '', 'a'], + [lex.TokenType.ATTR_QUOTE, '"'], + [lex.TokenType.ATTR_VALUE_TEXT, ''], + [lex.TokenType.ATTR_QUOTE, '"'], [lex.TokenType.TAG_OPEN_END], [lex.TokenType.EOF], ]); @@ -366,7 +396,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u expect(tokenizeAndHumanizeParts('')).toEqual([ [lex.TokenType.TAG_OPEN_START, '', 't'], [lex.TokenType.ATTR_NAME, '', 'a'], - [lex.TokenType.ATTR_VALUE, 'b'], + [lex.TokenType.ATTR_VALUE_TEXT, 'b'], [lex.TokenType.TAG_OPEN_END], [lex.TokenType.EOF], ]); @@ -377,7 +407,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TAG_OPEN_START, '', 't'], [lex.TokenType.ATTR_NAME, '', 'a'], [lex.TokenType.ATTR_QUOTE, '"'], - 
[lex.TokenType.ATTR_VALUE, 'AA'], + [lex.TokenType.ATTR_VALUE_TEXT, 'AA'], [lex.TokenType.ATTR_QUOTE, '"'], [lex.TokenType.TAG_OPEN_END], [lex.TokenType.EOF], @@ -389,11 +419,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TAG_OPEN_START, '', 't'], [lex.TokenType.ATTR_NAME, '', 'a'], [lex.TokenType.ATTR_QUOTE, '"'], - [lex.TokenType.ATTR_VALUE, '&'], + [lex.TokenType.ATTR_VALUE_TEXT, '&'], [lex.TokenType.ATTR_QUOTE, '"'], [lex.TokenType.ATTR_NAME, '', 'b'], [lex.TokenType.ATTR_QUOTE, '"'], - [lex.TokenType.ATTR_VALUE, 'c&&d'], + [lex.TokenType.ATTR_VALUE_TEXT, 'c&&d'], [lex.TokenType.ATTR_QUOTE, '"'], [lex.TokenType.TAG_OPEN_END], [lex.TokenType.EOF], @@ -405,7 +435,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TAG_OPEN_START, '', 't'], [lex.TokenType.ATTR_NAME, '', 'a'], [lex.TokenType.ATTR_QUOTE, '"'], - [lex.TokenType.ATTR_VALUE, 'b && c &'], + [lex.TokenType.ATTR_VALUE_TEXT, 'b && c &'], [lex.TokenType.ATTR_QUOTE, '"'], [lex.TokenType.TAG_OPEN_END], [lex.TokenType.EOF], @@ -417,7 +447,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TAG_OPEN_START, '', 't'], [lex.TokenType.ATTR_NAME, '', 'a'], [lex.TokenType.ATTR_QUOTE, '\''], - [lex.TokenType.ATTR_VALUE, 't\ne\ns\nt'], + [lex.TokenType.ATTR_VALUE_TEXT, 't\ne\ns\nt'], [lex.TokenType.ATTR_QUOTE, '\''], [lex.TokenType.TAG_OPEN_END], [lex.TokenType.EOF], @@ -428,7 +458,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u expect(tokenizeAndHumanizeSourceSpans('')).toEqual([ [lex.TokenType.TAG_OPEN_START, ''], [lex.TokenType.EOF, ''], ]); @@ -436,13 +466,13 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u it('should report missing closing single quote', () => { expect(tokenizeAndHumanizeErrors('')).toEqual([ - [lex.TokenType.ATTR_VALUE, 'Unexpected character "EOF"', '0:8'], + 
[lex.TokenType.ATTR_VALUE_TEXT, 'Unexpected character "EOF"', '0:8'], ]); }); it('should report missing closing double quote', () => { expect(tokenizeAndHumanizeErrors('