From 8d8ab4775c1ff197ad101cbd33690bc8620a3c56 Mon Sep 17 00:00:00 2001
From: atscott
Date: Tue, 3 Aug 2021 14:49:05 -0700
Subject: [PATCH] Revert "refactor(compiler): support encoded entity tokens
 when lexing markup (#42062)" (#43033)

This reverts commit 942b24d5ea5d36ad4e53ed435bda35a6ae6876c9.

PR Close #43033
---
 packages/compiler/src/ml_parser/lexer.ts      | 72 ++++++++-----------
 packages/compiler/src/ml_parser/parser.ts     | 38 +++++-----
 .../compiler/test/ml_parser/lexer_spec.ts     | 44 ++++--------
 3 files changed, 57 insertions(+), 97 deletions(-)

diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts
index 13d3a6bfba..d7306a2389 100644
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -23,7 +23,6 @@ export enum TokenType {
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
   INTERPOLATION,
-  ENCODED_ENTITY,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -396,16 +395,19 @@ class _Tokenizer {
     }
   }
 
-  private _readChar(): string {
-    // Don't rely upon reading directly from `_input` as the actual char value
-    // may have been generated from an escape sequence.
-    const char = String.fromCodePoint(this._cursor.peek());
-    this._cursor.advance();
-    return char;
+  private _readChar(decodeEntities: boolean): string {
+    if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
+      return this._decodeEntity();
+    } else {
+      // Don't rely upon reading directly from `_input` as the actual char value
+      // may have been generated from an escape sequence.
+      const char = String.fromCodePoint(this._cursor.peek());
+      this._cursor.advance();
+      return char;
+    }
   }
 
-  private _consumeEntity(textTokenType: TokenType): void {
-    this._beginToken(TokenType.ENCODED_ENTITY);
+  private _decodeEntity(): string {
     const start = this._cursor.clone();
     this._cursor.advance();
     if (this._attemptCharCode(chars.$HASH)) {
@@ -425,7 +427,7 @@ class _Tokenizer {
       this._cursor.advance();
       try {
         const charCode = parseInt(strNum, isHex ? 16 : 10);
-        this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
+        return String.fromCharCode(charCode);
       } catch {
         throw this._createError(
             _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -434,25 +436,21 @@ class _Tokenizer {
       const nameStart = this._cursor.clone();
       this._attemptCharCodeUntilFn(isNamedEntityEnd);
       if (this._cursor.peek() != chars.$SEMICOLON) {
-        // No semicolon was found so abort the encoded entity token that was in progress, and treat
-        // this as a text token
-        this._beginToken(textTokenType, start);
         this._cursor = nameStart;
-        this._endToken(['&']);
-      } else {
-        const name = this._cursor.getChars(nameStart);
-        this._cursor.advance();
-        const char = NAMED_ENTITIES[name];
-        if (!char) {
-          throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
-        }
-        this._endToken([char, `&${name};`]);
+        return '&';
       }
+      const name = this._cursor.getChars(nameStart);
+      this._cursor.advance();
+      const char = NAMED_ENTITIES[name];
+      if (!char) {
+        throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
+      }
+      return char;
     }
   }
 
-  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
-    this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
+  private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
+    this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
     const parts: string[] = [];
     while (true) {
       const tagCloseStart = this._cursor.clone();
@@ -461,16 +459,9 @@ class _Tokenizer {
       if (foundEndMarker) {
         break;
       }
-      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
-        parts.length = 0;
-        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
-        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
-      } else {
-        parts.push(this._readChar());
-      }
+      parts.push(this._readChar(decodeEntities));
     }
-    this._endToken([this._processCarriageReturns(parts.join(''))]);
+    return this._endToken([this._processCarriageReturns(parts.join(''))]);
   }
 
   private _consumeComment(start: CharacterCursor) {
@@ -572,8 +563,8 @@ class _Tokenizer {
     }
   }
 
-  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
-    this._consumeRawText(consumeEntities, () => {
+  private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
+    this._consumeRawText(decodeEntities, () => {
       if (!this._attemptCharCode(chars.$LT)) return false;
       if (!this._attemptCharCode(chars.$SLASH)) return false;
       this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -721,16 +712,11 @@ class _Tokenizer {
       const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
         this._endToken([this._processCarriageReturns(parts.join(''))], current);
-        parts.length = 0;
         this._consumeInterpolation(interpolationTokenType, current);
-        this._beginToken(textTokenType);
-      } else if (this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
         parts.length = 0;
-        this._consumeEntity(textTokenType);
         this._beginToken(textTokenType);
       } else {
-        parts.push(this._readChar());
+        parts.push(this._readChar(true));
       }
     }
 
@@ -909,9 +895,7 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
   let lastDstToken: Token|undefined = undefined;
   for (let i = 0; i < srcTokens.length; i++) {
     const token = srcTokens[i];
-    if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
-        (lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
-         token.type == TokenType.ATTR_VALUE_TEXT)) {
+    if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
       lastDstToken.parts[0]! += token.parts[0];
       lastDstToken.sourceSpan.end = token.sourceSpan.end;
     } else {
diff --git a/packages/compiler/src/ml_parser/parser.ts b/packages/compiler/src/ml_parser/parser.ts
index 4d5f18d440..9ac0b944e4 100644
--- a/packages/compiler/src/ml_parser/parser.ts
+++ b/packages/compiler/src/ml_parser/parser.ts
@@ -226,21 +226,20 @@ class _TreeBuilder {
       }
     }
 
-    // For now recombine text, interpolation and entity tokens
-    while (this._peek.type === lex.TokenType.INTERPOLATION ||
-           this._peek.type === lex.TokenType.TEXT ||
-           this._peek.type === lex.TokenType.ENCODED_ENTITY) {
-      token = this._advance();
-      if (token.type === lex.TokenType.INTERPOLATION) {
-        // For backward compatibility we decode HTML entities that appear in interpolation
-        // expressions. This is arguably a bug, but it could be a considerable breaking change to
-        // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
-        // chain after View Engine has been removed.
-        text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-      } else if (token.type === lex.TokenType.ENCODED_ENTITY) {
-        text += token.parts[0];
-      } else {
-        text += token.parts.join('');
+    // For now recombine text and interpolation tokens
+    if (this._peek.type === lex.TokenType.INTERPOLATION) {
+      while (this._peek.type === lex.TokenType.INTERPOLATION ||
+             this._peek.type === lex.TokenType.TEXT) {
+        token = this._advance();
+        if (token.type === lex.TokenType.INTERPOLATION) {
+          // For backward compatibility we decode HTML entities that appear in interpolation
+          // expressions. This is arguably a bug, but it could be a considerable breaking change to
+          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+          // chain after View Engine has been removed.
+          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else {
+          text += token.parts.join('');
+        }
       }
     }
 
@@ -370,17 +369,16 @@ class _TreeBuilder {
       this._advance();
     }
 
-    // Consume the attribute value
+    // Consume the value
     let value = '';
     let valueStartSpan: ParseSourceSpan|undefined = undefined;
     let valueEnd: ParseLocation|undefined = undefined;
     if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
       valueStartSpan = this._peek.sourceSpan;
       valueEnd = this._peek.sourceSpan.end;
-      // For now recombine text, interpolation and entity tokens
+      // For now we are recombining text and interpolation tokens
       while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
-             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
-             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
+             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
         let valueToken = this._advance();
         if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
@@ -388,8 +386,6 @@ class _TreeBuilder {
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
-          value += valueToken.parts[0];
         } else {
           value += valueToken.parts.join('');
         }
diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts
index bc8559221b..835d59970d 100644
--- a/packages/compiler/test/ml_parser/lexer_spec.ts
+++ b/packages/compiler/test/ml_parser/lexer_spec.ts
@@ -407,11 +407,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TAG_OPEN_START, '', 't'],
         [lex.TokenType.ATTR_NAME, '', 'a'],
         [lex.TokenType.ATTR_QUOTE, '"'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
+        [lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
         [lex.TokenType.ATTR_QUOTE, '"'],
         [lex.TokenType.TAG_OPEN_END],
         [lex.TokenType.EOF],
@@ -526,60 +522,50 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     describe('entities', () => {
       it('should parse named entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
           [lex.TokenType.EOF],
         ]);
       });
 
       it('should parse hexadecimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'AA'],
           [lex.TokenType.EOF],
         ]);
       });
 
      it('should parse decimal entities', () => {
        expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'A'],
          [lex.TokenType.EOF],
        ]);
      });
 
      it('should store the locations', () => {
        expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
          [lex.TokenType.EOF, ''],
        ]);
      });
 
      it('should report malformed/unknown entities', () => {
        expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unknown entity "tbo" - use the "&#<decimal>;" or "&#x<hex>;" syntax', '0:0'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#asdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unable to parse entity "&#as" - decimal character reference entities must end with ";"',
          '0:4'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
          '0:5'
        ]]);
 
        expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
-          [lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
+          [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
        ]);
      });
    });
@@ -657,16 +643,12 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
       it('should parse entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
           [lex.TokenType.EOF],
         ]);
 
         expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
           [lex.TokenType.EOF, ''],
         ]);
       });
@@ -912,9 +894,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
       expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
         [lex.TokenType.TAG_OPEN_START, '', 'title'],
         [lex.TokenType.TAG_OPEN_END],
-        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
+        [lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
        [lex.TokenType.TAG_CLOSE, '', 'title'],
        [lex.TokenType.EOF],
      ]);
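
Note on the restored behavior: with this revert, entity decoding happens inline
while the lexer consumes text (_readChar delegating to _decodeEntity), so input
like 'a&amp;b' again yields a single TEXT token with parts ['a&b'] instead of a
TEXT / ENCODED_ENTITY / TEXT sequence, as the updated specs above show. The
standalone TypeScript sketch below mirrors that restored control flow; it is
illustrative only (decodeEntitiesInText and the trimmed NAMED_ENTITIES map are
not part of the compiler), and it folds the lexer's error paths into a literal
'&' fallback where the real _decodeEntity would throw.

// Illustrative sketch -- mirrors the restored _decodeEntity()/_readChar()
// control flow over a plain string instead of a CharacterCursor.
const NAMED_ENTITIES: Record<string, string> = {amp: '&', lt: '<', gt: '>'};

function decodeEntitiesInText(input: string): string {
  let out = '';
  let i = 0;
  while (i < input.length) {
    if (input[i] !== '&') {
      out += input[i++];  // plain character: consumed directly, as in _readChar()
      continue;
    }
    const rest = input.slice(i + 1);
    const hex = /^#[xX]([0-9a-fA-F]+);/.exec(rest);  // "&#x41;" -> "A"
    const dec = /^#([0-9]+);/.exec(rest);            // "&#65;"  -> "A"
    const named = /^([a-zA-Z]+);/.exec(rest);        // "&amp;"  -> "&"
    if (hex) {
      out += String.fromCharCode(parseInt(hex[1], 16));
      i += 1 + hex[0].length;
    } else if (dec) {
      out += String.fromCharCode(parseInt(dec[1], 10));
      i += 1 + dec[0].length;
    } else if (named && NAMED_ENTITIES[named[1]] !== undefined) {
      out += NAMED_ENTITIES[named[1]];
      i += 1 + named[0].length;
    } else {
      // No terminating ";": the restored code rewinds the cursor and emits a
      // literal '&'. (For malformed numeric or unknown named references the
      // real lexer throws instead of falling back as done here.)
      out += '&';
      i++;
    }
  }
  return out;
}

// Matches the revert's expectations: one merged text run per input.
console.log(decodeEntitiesInText('a&amp;b'));      // "a&b"
console.log(decodeEntitiesInText('&#x41;&#65;'));  // "AA"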