Revert "refactor(compiler): support encoded entity tokens when lexing markup (#42062)" (#43033)

This reverts commit 942b24d5ea.

PR Close #43033
atscott 2021-08-03 14:49:05 -07:00
parent ea5ed4e4d4
commit 8d8ab4775c
3 changed files with 57 additions and 97 deletions
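
At a high level, this revert removes the separate ENCODED_ENTITY token stream and restores the older behaviour in which the lexer decodes character references inline while reading text, so an input such as a&amp;b once again yields a single TEXT token whose value is 'a&b' (see the updated lexer specs below). The following is a minimal standalone sketch of that inline decoding; the names are assumptions for illustration, only a tiny subset of NAMED_ENTITIES is modelled, and the compiler's real cursor, span, and error machinery is omitted.

// Sketch only: a hand-rolled entity table standing in for the compiler's NAMED_ENTITIES.
const NAMED_ENTITIES: Record<string, string> = {amp: '&', lt: '<', gt: '>'};

function decodeEntitiesInline(input: string): string {
  let out = '';
  for (let i = 0; i < input.length; i++) {
    if (input[i] !== '&') {
      out += input[i];
      continue;
    }
    const end = input.indexOf(';', i);
    if (end === -1) {
      out += '&';  // no terminating ';': keep the '&' and carry on reading plain text
      continue;
    }
    const body = input.slice(i + 1, end);
    if (body[0] === '#') {
      // Numeric character reference, decimal (&#65;) or hexadecimal (&#x41;).
      const isHex = body[1] === 'x' || body[1] === 'X';
      out += String.fromCharCode(parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10));
    } else if (NAMED_ENTITIES[body] !== undefined) {
      out += NAMED_ENTITIES[body];
    } else {
      out += '&' + body + ';';  // unknown entity; the real lexer raises a parse error instead
    }
    i = end;  // skip past the ';'
  }
  return out;
}

decodeEntitiesInline('a&amp;b');      // 'a&b'  -> emitted as one TEXT token after the revert
decodeEntitiesInline('&#x41;&#65;');  // 'AA'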

View File

@@ -23,7 +23,6 @@ export enum TokenType {
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
   INTERPOLATION,
-  ENCODED_ENTITY,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -396,16 +395,19 @@ class _Tokenizer {
     }
   }

-  private _readChar(): string {
+  private _readChar(decodeEntities: boolean): string {
+    if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
+      return this._decodeEntity();
+    } else {
       // Don't rely upon reading directly from `_input` as the actual char value
       // may have been generated from an escape sequence.
       const char = String.fromCodePoint(this._cursor.peek());
       this._cursor.advance();
       return char;
+    }
   }

-  private _consumeEntity(textTokenType: TokenType): void {
-    this._beginToken(TokenType.ENCODED_ENTITY);
+  private _decodeEntity(): string {
     const start = this._cursor.clone();
     this._cursor.advance();
     if (this._attemptCharCode(chars.$HASH)) {
@@ -425,7 +427,7 @@
       this._cursor.advance();
       try {
         const charCode = parseInt(strNum, isHex ? 16 : 10);
-        this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
+        return String.fromCharCode(charCode);
       } catch {
         throw this._createError(
             _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -434,25 +436,21 @@
       const nameStart = this._cursor.clone();
       this._attemptCharCodeUntilFn(isNamedEntityEnd);
       if (this._cursor.peek() != chars.$SEMICOLON) {
-        // No semicolon was found so abort the encoded entity token that was in progress, and treat
-        // this as a text token
-        this._beginToken(textTokenType, start);
         this._cursor = nameStart;
-        this._endToken(['&']);
-      } else {
+        return '&';
+      }
       const name = this._cursor.getChars(nameStart);
       this._cursor.advance();
       const char = NAMED_ENTITIES[name];
       if (!char) {
         throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
       }
-        this._endToken([char, `&${name};`]);
-      }
+      return char;
     }
   }

-  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
-    this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
+  private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
+    this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
     const parts: string[] = [];
     while (true) {
       const tagCloseStart = this._cursor.clone();
@@ -461,16 +459,9 @@
       if (foundEndMarker) {
         break;
       }
-      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
-        parts.length = 0;
-        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
-        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
-      } else {
-        parts.push(this._readChar());
-      }
+      parts.push(this._readChar(decodeEntities));
     }
-    this._endToken([this._processCarriageReturns(parts.join(''))]);
+    return this._endToken([this._processCarriageReturns(parts.join(''))]);
   }

   private _consumeComment(start: CharacterCursor) {
@@ -572,8 +563,8 @@
     }
   }

-  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
-    this._consumeRawText(consumeEntities, () => {
+  private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
+    this._consumeRawText(decodeEntities, () => {
       if (!this._attemptCharCode(chars.$LT)) return false;
       if (!this._attemptCharCode(chars.$SLASH)) return false;
       this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -721,16 +712,11 @@
       const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
         this._endToken([this._processCarriageReturns(parts.join(''))], current);
-        parts.length = 0;
         this._consumeInterpolation(interpolationTokenType, current);
-        this._beginToken(textTokenType);
-      } else if (this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
         parts.length = 0;
-        this._consumeEntity(textTokenType);
         this._beginToken(textTokenType);
       } else {
-        parts.push(this._readChar());
+        parts.push(this._readChar(true));
       }
     }
@@ -909,9 +895,7 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
   let lastDstToken: Token|undefined = undefined;
   for (let i = 0; i < srcTokens.length; i++) {
     const token = srcTokens[i];
-    if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
-        (lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
-         token.type == TokenType.ATTR_VALUE_TEXT)) {
+    if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
       lastDstToken.parts[0]! += token.parts[0];
       lastDstToken.sourceSpan.end = token.sourceSpan.end;
     } else {
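
The last hunk above also narrows mergeTextTokens: with entities folded back into the surrounding text, only adjacent TEXT tokens still need merging, and the ATTR_VALUE_TEXT case introduced by #42062 goes away. A rough, self-contained sketch of that merge, using a made-up SimpleToken shape without source spans:

// Hypothetical simplified token shape for illustration; the real Token also carries
// a sourceSpan whose end is extended when tokens are merged.
interface SimpleToken {
  type: 'TEXT' | 'INTERPOLATION' | 'OTHER';
  parts: string[];
}

function mergeAdjacentTextTokens(srcTokens: SimpleToken[]): SimpleToken[] {
  const dstTokens: SimpleToken[] = [];
  let lastDstToken: SimpleToken|undefined = undefined;
  for (const token of srcTokens) {
    if (lastDstToken && lastDstToken.type === 'TEXT' && token.type === 'TEXT') {
      // Fold consecutive text tokens into the previous one.
      lastDstToken.parts[0] += token.parts[0];
    } else {
      lastDstToken = {type: token.type, parts: [...token.parts]};
      dstTokens.push(lastDstToken);
    }
  }
  return dstTokens;
}

mergeAdjacentTextTokens([
  {type: 'TEXT', parts: ['a']},
  {type: 'TEXT', parts: ['&b']},
]);  // -> [{type: 'TEXT', parts: ['a&b']}]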

View File

@@ -226,10 +226,10 @@ class _TreeBuilder {
       }
     }

-    // For now recombine text, interpolation and entity tokens
+    // For now recombine text and interpolation tokens
+    if (this._peek.type === lex.TokenType.INTERPOLATION) {
       while (this._peek.type === lex.TokenType.INTERPOLATION ||
-             this._peek.type === lex.TokenType.TEXT ||
-             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
+             this._peek.type === lex.TokenType.TEXT) {
         token = this._advance();
         if (token.type === lex.TokenType.INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
@@ -237,12 +237,11 @@
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else if (token.type === lex.TokenType.ENCODED_ENTITY) {
-          text += token.parts[0];
         } else {
           text += token.parts.join('');
         }
       }
+    }

     if (text.length > 0) {
       const endSpan = token.sourceSpan;
@@ -370,17 +369,16 @@
       this._advance();
     }

-    // Consume the attribute value
+    // Consume the value
     let value = '';
     let valueStartSpan: ParseSourceSpan|undefined = undefined;
     let valueEnd: ParseLocation|undefined = undefined;
     if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
       valueStartSpan = this._peek.sourceSpan;
       valueEnd = this._peek.sourceSpan.end;
-      // For now recombine text, interpolation and entity tokens
+      // For now we are recombining text and interpolation tokens
       while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
-             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
-             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
+             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
         let valueToken = this._advance();
         if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
@@ -388,8 +386,6 @@
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
-          value += valueToken.parts[0];
         } else {
           value += valueToken.parts.join('');
         }
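
On the parser side the revert is symmetrical: _TreeBuilder no longer has ENCODED_ENTITY tokens to splice in, so it only recombines text and interpolation tokens, while entities that appear inside interpolations are still decoded through the regex fallback kept for backward compatibility. Below is a rough sketch of that recombination, with an invented token shape and a stand-in decodeEntity helper; the real helper lives in the parser and covers the full entity table.

// Invented token shape for illustration only.
type PartToken =
    {kind: 'text', parts: string[]}|
    {kind: 'interpolation', parts: string[]};

// Stand-in for the parser's decodeEntity(match, entity) callback; only numeric
// references and &amp; are handled here.
function decodeEntity(match: string, entity: string): string {
  if (entity === 'amp') return '&';
  if (entity[0] === '#') {
    const isHex = entity[1] === 'x' || entity[1] === 'X';
    return String.fromCharCode(parseInt(entity.slice(isHex ? 2 : 1), isHex ? 16 : 10));
  }
  return match;  // unknown entity: leave the source text as-is
}

function recombine(tokens: PartToken[]): string {
  let text = '';
  for (const token of tokens) {
    if (token.kind === 'interpolation') {
      // Backward-compatibility quirk noted in the comments above: entities inside
      // interpolation expressions are decoded when the parts are re-joined.
      text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
    } else {
      text += token.parts.join('');
    }
  }
  return text;
}

recombine([
  {kind: 'text', parts: ['a']},
  {kind: 'interpolation', parts: ['{{', ' b &amp; c ', '}}']},
]);  // -> 'a{{ b & c }}'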

View File

@@ -407,11 +407,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TAG_OPEN_START, '', 't'],
         [lex.TokenType.ATTR_NAME, '', 'a'],
         [lex.TokenType.ATTR_QUOTE, '"'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
+        [lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
         [lex.TokenType.ATTR_QUOTE, '"'],
         [lex.TokenType.TAG_OPEN_END],
         [lex.TokenType.EOF],
@@ -526,60 +522,50 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     describe('entities', () => {
       it('should parse named entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
           [lex.TokenType.EOF],
         ]);
       });

       it('should parse hexadecimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'AA'],
           [lex.TokenType.EOF],
         ]);
       });

       it('should parse decimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'A'],
          [lex.TokenType.EOF],
        ]);
      });

       it('should store the locations', () => {
         expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
           [lex.TokenType.EOF, ''],
         ]);
       });

       it('should report malformed/unknown entities', () => {
         expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
           'Unknown entity "tbo" - use the "&#<decimal>;" or "&#x<hex>;" syntax', '0:0'
         ]]);
         expect(tokenizeAndHumanizeErrors('&#3sdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
           'Unable to parse entity "&#3s" - decimal character reference entities must end with ";"',
           '0:4'
         ]]);
         expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
           'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
           '0:5'
         ]]);
         expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
-          [lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
+          [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
         ]);
       });
     });
@@ -657,16 +643,12 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
       it('should parse entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
           [lex.TokenType.EOF],
         ]);

         expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
           [lex.TokenType.EOF, ''],
         ]);
       });
@@ -912,9 +894,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
           [lex.TokenType.TAG_OPEN_START, '', 'title'],
           [lex.TokenType.TAG_OPEN_END],
-          [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
+          [lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
           [lex.TokenType.TAG_CLOSE, '', 'title'],
           [lex.TokenType.EOF],
         ]);