diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts
index d7306a2389..13d3a6bfba 100644
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -23,6 +23,7 @@ export enum TokenType {
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
   INTERPOLATION,
+  ENCODED_ENTITY,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -395,19 +396,16 @@ class _Tokenizer {
     }
   }

-  private _readChar(decodeEntities: boolean): string {
-    if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
-      return this._decodeEntity();
-    } else {
-      // Don't rely upon reading directly from `_input` as the actual char value
-      // may have been generated from an escape sequence.
-      const char = String.fromCodePoint(this._cursor.peek());
-      this._cursor.advance();
-      return char;
-    }
+  private _readChar(): string {
+    // Don't rely upon reading directly from `_input` as the actual char value
+    // may have been generated from an escape sequence.
+    const char = String.fromCodePoint(this._cursor.peek());
+    this._cursor.advance();
+    return char;
   }

-  private _decodeEntity(): string {
+  private _consumeEntity(textTokenType: TokenType): void {
+    this._beginToken(TokenType.ENCODED_ENTITY);
     const start = this._cursor.clone();
     this._cursor.advance();
     if (this._attemptCharCode(chars.$HASH)) {
@@ -427,7 +425,7 @@ class _Tokenizer {
       this._cursor.advance();
       try {
         const charCode = parseInt(strNum, isHex ? 16 : 10);
-        return String.fromCharCode(charCode);
+        this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
       } catch {
         throw this._createError(
             _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -436,21 +434,25 @@ class _Tokenizer {
       const nameStart = this._cursor.clone();
       this._attemptCharCodeUntilFn(isNamedEntityEnd);
       if (this._cursor.peek() != chars.$SEMICOLON) {
+        // No semicolon was found so abort the encoded entity token that was in progress, and treat
+        // this as a text token
+        this._beginToken(textTokenType, start);
         this._cursor = nameStart;
-        return '&';
+        this._endToken(['&']);
+      } else {
+        const name = this._cursor.getChars(nameStart);
+        this._cursor.advance();
+        const char = NAMED_ENTITIES[name];
+        if (!char) {
+          throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
+        }
+        this._endToken([char, `&${name};`]);
       }
-      const name = this._cursor.getChars(nameStart);
-      this._cursor.advance();
-      const char = NAMED_ENTITIES[name];
-      if (!char) {
-        throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
-      }
-      return char;
     }
   }

-  private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
-    this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
+  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
+    this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
     const parts: string[] = [];
     while (true) {
       const tagCloseStart = this._cursor.clone();
@@ -459,9 +461,16 @@ class _Tokenizer {
       if (foundEndMarker) {
         break;
       }
-      parts.push(this._readChar(decodeEntities));
+      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
+        this._endToken([this._processCarriageReturns(parts.join(''))]);
+        parts.length = 0;
+        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
+        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
+      } else {
+        parts.push(this._readChar());
+      }
     }
-    return this._endToken([this._processCarriageReturns(parts.join(''))]);
+    this._endToken([this._processCarriageReturns(parts.join(''))]);
   }

   private _consumeComment(start: CharacterCursor) {
@@ -563,8 +572,8 @@ class _Tokenizer {
     }
   }

-  private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
-    this._consumeRawText(decodeEntities, () => {
+  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
+    this._consumeRawText(consumeEntities, () => {
       if (!this._attemptCharCode(chars.$LT)) return false;
       if (!this._attemptCharCode(chars.$SLASH)) return false;
       this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -712,11 +721,16 @@ class _Tokenizer {
       const current = this._cursor.clone();
      if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
         this._endToken([this._processCarriageReturns(parts.join(''))], current);
-        this._consumeInterpolation(interpolationTokenType, current);
         parts.length = 0;
+        this._consumeInterpolation(interpolationTokenType, current);
+        this._beginToken(textTokenType);
+      } else if (this._cursor.peek() === chars.$AMPERSAND) {
+        this._endToken([this._processCarriageReturns(parts.join(''))]);
+        parts.length = 0;
+        this._consumeEntity(textTokenType);
+        this._beginToken(textTokenType);
       } else {
-        parts.push(this._readChar(true));
+        parts.push(this._readChar());
       }
     }
@@ -895,7 +909,9 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
   let lastDstToken: Token|undefined = undefined;
   for (let i = 0; i < srcTokens.length; i++) {
     const token = srcTokens[i];
-    if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
+    if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
+        (lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
+         token.type == TokenType.ATTR_VALUE_TEXT)) {
       lastDstToken.parts[0]! += token.parts[0];
       lastDstToken.sourceSpan.end = token.sourceSpan.end;
     } else {
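Note (illustration, not part of the patch): the shape of the new token stream is easiest to see in isolation. The sketch below is deliberately simplified, with a bare `Token` shape that omits source spans, a three-entry stand-in for the compiler's `NAMED_ENTITIES` table, and regex scanning in place of the real `CharacterCursor` machinery. It mimics how `_consumeEntity()` ends the current text token at each `&` and emits an `ENCODED_ENTITY` token whose parts are `[decoded, raw]`.

```ts
// Simplified token shape; the real lexer's tokens also carry source spans.
enum TokenType { TEXT, ATTR_VALUE_TEXT, ENCODED_ENTITY }

interface Token {
  type: TokenType;
  // TEXT/ATTR_VALUE_TEXT tokens carry [text]; ENCODED_ENTITY tokens carry [decoded, raw].
  parts: string[];
}

// Tiny stand-in for the compiler's NAMED_ENTITIES table.
const NAMED_ENTITIES: {[name: string]: string} = {amp: '&', lt: '<', gt: '>'};

// Mimics _consumeEntity(): at each '&' that starts a well-formed entity, end the
// current text token, emit an ENCODED_ENTITY token, then resume a new text token.
// (The real lexer raises a parse error for unknown or malformed entities; this
// sketch just falls back to plain text.)
function tokenizeText(input: string): Token[] {
  const tokens: Token[] = [];
  let text = '';
  let i = 0;
  while (i < input.length) {
    const match = input[i] === '&' ?
        /^&(#[xX]?[0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);/.exec(input.substring(i)) :
        null;
    if (match !== null) {
      tokens.push({type: TokenType.TEXT, parts: [text]});
      text = '';
      const body = match[1];
      const decoded = body[0] === '#' ?
          String.fromCharCode(
              parseInt(body.replace(/^#[xX]?/, ''), /^#[xX]/.test(body) ? 16 : 10)) :
          NAMED_ENTITIES[body] ?? match[0];
      tokens.push({type: TokenType.ENCODED_ENTITY, parts: [decoded, match[0]]});
      i += match[0].length;
    } else {
      text += input[i++];
    }
  }
  tokens.push({type: TokenType.TEXT, parts: [text]});
  return tokens;
}

// tokenizeText('a&amp;b') yields TEXT ['a'], ENCODED_ENTITY ['&', '&amp;'],
// TEXT ['b'], matching the updated expectations in lexer_spec.ts below.
```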
diff --git a/packages/compiler/src/ml_parser/parser.ts b/packages/compiler/src/ml_parser/parser.ts
index 9ac0b944e4..4d5f18d440 100644
--- a/packages/compiler/src/ml_parser/parser.ts
+++ b/packages/compiler/src/ml_parser/parser.ts
@@ -226,20 +226,21 @@ class _TreeBuilder {
     }
   }

-    // For now recombine text and interpolation tokens
-    if (this._peek.type === lex.TokenType.INTERPOLATION) {
-      while (this._peek.type === lex.TokenType.INTERPOLATION ||
-             this._peek.type === lex.TokenType.TEXT) {
-        token = this._advance();
-        if (token.type === lex.TokenType.INTERPOLATION) {
-          // For backward compatibility we decode HTML entities that appear in interpolation
-          // expressions. This is arguably a bug, but it could be a considerable breaking change to
-          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
-          // chain after View Engine has been removed.
-          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else {
-          text += token.parts.join('');
-        }
+    // For now recombine text, interpolation and entity tokens
+    while (this._peek.type === lex.TokenType.INTERPOLATION ||
+           this._peek.type === lex.TokenType.TEXT ||
+           this._peek.type === lex.TokenType.ENCODED_ENTITY) {
+      token = this._advance();
+      if (token.type === lex.TokenType.INTERPOLATION) {
+        // For backward compatibility we decode HTML entities that appear in interpolation
+        // expressions. This is arguably a bug, but it could be a considerable breaking change to
+        // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+        // chain after View Engine has been removed.
+        text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+      } else if (token.type === lex.TokenType.ENCODED_ENTITY) {
+        text += token.parts[0];
+      } else {
+        text += token.parts.join('');
       }
     }

@@ -369,16 +370,17 @@ class _TreeBuilder {
       this._advance();
     }

-    // Consume the value
+    // Consume the attribute value
     let value = '';
     let valueStartSpan: ParseSourceSpan|undefined = undefined;
     let valueEnd: ParseLocation|undefined = undefined;
     if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
       valueStartSpan = this._peek.sourceSpan;
       valueEnd = this._peek.sourceSpan.end;
-      // For now we are recombining text and interpolation tokens
+      // For now recombine text, interpolation and entity tokens
       while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
-             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
+             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
+             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
         let valueToken = this._advance();
         if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
@@ -386,6 +388,8 @@
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
+          value += valueToken.parts[0];
         } else {
           value += valueToken.parts.join('');
         }
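Continuing the illustration (same simplified types as above, not the real `_TreeBuilder` API): the recombination keeps parsed text backward compatible because an `ENCODED_ENTITY` token contributes only its decoded part to the text node, while the raw source text remains available in `parts[1]` for consumers that need the original markup.

```ts
// Mirrors the recombination loops added to _TreeBuilder above: entity tokens
// contribute parts[0] (the decoded character); all other tokens contribute
// their joined parts, as before.
function recombineText(tokens: Token[]): string {
  let text = '';
  for (const token of tokens) {
    text += token.type === TokenType.ENCODED_ENTITY ?
        token.parts[0] :        // e.g. '&' for the source text '&amp;'
        token.parts.join('');
  }
  return text;
}

// recombineText(tokenizeText('a&amp;b')) === 'a&b', the same text node content
// as before this refactoring.
```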
diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts
index 835d59970d..bc8559221b 100644
--- a/packages/compiler/test/ml_parser/lexer_spec.ts
+++ b/packages/compiler/test/ml_parser/lexer_spec.ts
@@ -407,7 +407,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TAG_OPEN_START, '', 't'],
         [lex.TokenType.ATTR_NAME, '', 'a'],
         [lex.TokenType.ATTR_QUOTE, '"'],
-        [lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
+        [lex.TokenType.ATTR_VALUE_TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
+        [lex.TokenType.ATTR_VALUE_TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
+        [lex.TokenType.ATTR_VALUE_TEXT, ''],
         [lex.TokenType.ATTR_QUOTE, '"'],
         [lex.TokenType.TAG_OPEN_END],
         [lex.TokenType.EOF],
@@ -522,50 +526,60 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
   describe('entities', () => {
     it('should parse named entities', () => {
       expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-        [lex.TokenType.TEXT, 'a&b'],
+        [lex.TokenType.TEXT, 'a'],
+        [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+        [lex.TokenType.TEXT, 'b'],
         [lex.TokenType.EOF],
       ]);
     });

     it('should parse hexadecimal entities', () => {
       expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
-        [lex.TokenType.TEXT, 'AA'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
     });

     it('should parse decimal entities', () => {
       expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
-        [lex.TokenType.TEXT, 'A'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
+        [lex.TokenType.TEXT, ''],
        [lex.TokenType.EOF],
       ]);
     });

     it('should store the locations', () => {
       expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-        [lex.TokenType.TEXT, 'a&amp;b'],
+        [lex.TokenType.TEXT, 'a'],
+        [lex.TokenType.ENCODED_ENTITY, '&amp;'],
+        [lex.TokenType.TEXT, 'b'],
         [lex.TokenType.EOF, ''],
       ]);
     });

     it('should report malformed/unknown entities', () => {
       expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
-        lex.TokenType.TEXT,
+        lex.TokenType.ENCODED_ENTITY,
         'Unknown entity "tbo" - use the "&#<decimal>;" or "&#x<hex>;" syntax', '0:0'
       ]]);
       expect(tokenizeAndHumanizeErrors('&#3sdf;')).toEqual([[
-        lex.TokenType.TEXT,
+        lex.TokenType.ENCODED_ENTITY,
         'Unable to parse entity "&#3s" - decimal character reference entities must end with ";"',
         '0:4'
       ]]);
       expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
-        lex.TokenType.TEXT,
+        lex.TokenType.ENCODED_ENTITY,
         'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
         '0:5'
       ]]);
       expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
-        [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
+        [lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
       ]);
     });
   });
@@ -643,12 +657,16 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     it('should parse entities', () => {
       expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-        [lex.TokenType.TEXT, 'a&b'],
+        [lex.TokenType.TEXT, 'a'],
+        [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+        [lex.TokenType.TEXT, 'b'],
         [lex.TokenType.EOF],
       ]);
       expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-        [lex.TokenType.TEXT, 'a&amp;b'],
+        [lex.TokenType.TEXT, 'a'],
+        [lex.TokenType.ENCODED_ENTITY, '&amp;'],
+        [lex.TokenType.TEXT, 'b'],
         [lex.TokenType.EOF, ''],
       ]);
     });
@@ -894,7 +912,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
       expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
         [lex.TokenType.TAG_OPEN_START, '', 'title'],
         [lex.TokenType.TAG_OPEN_END],
-        [lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
+        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
         [lex.TokenType.TAG_CLOSE, '', 'title'],
         [lex.TokenType.EOF],
       ]);
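A final illustrative note on the `mergeTextTokens()` change in lexer.ts: when a `&` turns out not to start a valid named entity (no terminating `;`), the aborted entity is re-emitted as a plain text token, which can leave adjacent text fragments behind, now also in attribute values. The sketch below (same simplified types as above; the real implementation also extends the merged token's `sourceSpan.end`) shows the extended merge.

```ts
// Merge runs of adjacent TEXT or ATTR_VALUE_TEXT tokens, as mergeTextTokens()
// now does for both token types.
function mergeAdjacentTextTokens(srcTokens: Token[]): Token[] {
  const dstTokens: Token[] = [];
  let last: Token|undefined = undefined;
  for (const token of srcTokens) {
    if (last !== undefined && last.type === token.type &&
        (token.type === TokenType.TEXT || token.type === TokenType.ATTR_VALUE_TEXT)) {
      last.parts[0] += token.parts[0];  // fold the fragment into the previous token
    } else {
      last = token;
      dstTokens.push(token);
    }
  }
  return dstTokens;
}

// e.g. the fragments around a bare '&' collapse back into one token:
mergeAdjacentTextTokens([
  {type: TokenType.ATTR_VALUE_TEXT, parts: ['a']},
  {type: TokenType.ATTR_VALUE_TEXT, parts: ['&']},  // aborted entity, re-emitted as text
  {type: TokenType.ATTR_VALUE_TEXT, parts: [' b']},
]);  // → [{type: ATTR_VALUE_TEXT, parts: ['a& b']}]
```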