Revert "refactor(compiler): support encoded entity tokens when lexing markup (#42062)" (#43033)

This reverts commit 942b24d5ea.

PR Close #43033
atscott 2021-08-03 14:49:05 -07:00
parent ea5ed4e4d4
commit 8d8ab4775c
3 changed files with 57 additions and 97 deletions
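Behaviorally, the revert means the lexer once again decodes entities inline and folds them into the surrounding text token, rather than emitting standalone ENCODED_ENTITY tokens for the parser to recombine. A minimal sketch of the observable difference, based on the spec expectations in this diff (the deep import paths and the `tokenize` call shape are assumptions about the compiler's internal ml_parser entry points, not public API):

```ts
import {tokenize} from '@angular/compiler/src/ml_parser/lexer';
import {getHtmlTagDefinition} from '@angular/compiler/src/ml_parser/html_tags';

const {tokens} = tokenize('a&amp;b', 'test.html', getHtmlTagDefinition);
console.log(tokens.map(t => [t.type, ...t.parts]));
// After this revert:   [[TEXT, 'a&b'], [EOF]]
// Before this revert:  [[TEXT, 'a'], [ENCODED_ENTITY, '&', '&amp;'], [TEXT, 'b'], [EOF]]
```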

packages/compiler/src/ml_parser/lexer.ts

@@ -23,7 +23,6 @@ export enum TokenType {
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
   INTERPOLATION,
-  ENCODED_ENTITY,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -396,16 +395,19 @@ class _Tokenizer {
     }
   }
 
-  private _readChar(): string {
-    // Don't rely upon reading directly from `_input` as the actual char value
-    // may have been generated from an escape sequence.
-    const char = String.fromCodePoint(this._cursor.peek());
-    this._cursor.advance();
-    return char;
+  private _readChar(decodeEntities: boolean): string {
+    if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
+      return this._decodeEntity();
+    } else {
+      // Don't rely upon reading directly from `_input` as the actual char value
+      // may have been generated from an escape sequence.
+      const char = String.fromCodePoint(this._cursor.peek());
+      this._cursor.advance();
+      return char;
+    }
   }
 
-  private _consumeEntity(textTokenType: TokenType): void {
-    this._beginToken(TokenType.ENCODED_ENTITY);
+  private _decodeEntity(): string {
     const start = this._cursor.clone();
     this._cursor.advance();
     if (this._attemptCharCode(chars.$HASH)) {
@@ -425,7 +427,7 @@ class _Tokenizer {
       this._cursor.advance();
       try {
         const charCode = parseInt(strNum, isHex ? 16 : 10);
-        this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
+        return String.fromCharCode(charCode);
       } catch {
         throw this._createError(
             _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -434,25 +436,21 @@ class _Tokenizer {
       const nameStart = this._cursor.clone();
       this._attemptCharCodeUntilFn(isNamedEntityEnd);
       if (this._cursor.peek() != chars.$SEMICOLON) {
-        // No semicolon was found so abort the encoded entity token that was in progress, and treat
-        // this as a text token
-        this._beginToken(textTokenType, start);
         this._cursor = nameStart;
-        this._endToken(['&']);
-      } else {
-        const name = this._cursor.getChars(nameStart);
-        this._cursor.advance();
-        const char = NAMED_ENTITIES[name];
-        if (!char) {
-          throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
-        }
-        this._endToken([char, `&${name};`]);
+        return '&';
       }
+      const name = this._cursor.getChars(nameStart);
+      this._cursor.advance();
+      const char = NAMED_ENTITIES[name];
+      if (!char) {
+        throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
+      }
+      return char;
     }
   }
 
-  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
-    this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
+  private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
+    this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
     const parts: string[] = [];
     while (true) {
       const tagCloseStart = this._cursor.clone();
@@ -461,16 +459,9 @@ class _Tokenizer {
       if (foundEndMarker) {
         break;
       }
-      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
-        parts.length = 0;
-        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
-        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
-      } else {
-        parts.push(this._readChar());
-      }
+      parts.push(this._readChar(decodeEntities));
     }
-    this._endToken([this._processCarriageReturns(parts.join(''))]);
+    return this._endToken([this._processCarriageReturns(parts.join(''))]);
   }
 
   private _consumeComment(start: CharacterCursor) {
@@ -572,8 +563,8 @@ class _Tokenizer {
     }
   }
 
-  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
-    this._consumeRawText(consumeEntities, () => {
+  private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
+    this._consumeRawText(decodeEntities, () => {
       if (!this._attemptCharCode(chars.$LT)) return false;
       if (!this._attemptCharCode(chars.$SLASH)) return false;
       this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -721,16 +712,11 @@ class _Tokenizer {
       const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
         this._endToken([this._processCarriageReturns(parts.join(''))], current);
         parts.length = 0;
         this._consumeInterpolation(interpolationTokenType, current);
         this._beginToken(textTokenType);
-      } else if (this._cursor.peek() === chars.$AMPERSAND) {
-        this._endToken([this._processCarriageReturns(parts.join(''))]);
-        parts.length = 0;
-        this._consumeEntity(textTokenType);
-        this._beginToken(textTokenType);
       } else {
-        parts.push(this._readChar());
+        parts.push(this._readChar(true));
       }
     }
@@ -909,9 +895,7 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
   let lastDstToken: Token|undefined = undefined;
   for (let i = 0; i < srcTokens.length; i++) {
     const token = srcTokens[i];
-    if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
-        (lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
-         token.type == TokenType.ATTR_VALUE_TEXT)) {
+    if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
       lastDstToken.parts[0]! += token.parts[0];
       lastDstToken.sourceSpan.end = token.sourceSpan.end;
     } else {
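The restored `_decodeEntity()` above returns the decoded character directly from the cursor position instead of emitting an ENCODED_ENTITY token (falling back to a plain `'&'` when no terminating semicolon is found). A simplified, cursor-free sketch of the same decoding rules; here `NAMED` is a small stand-in for the lexer's `NAMED_ENTITIES` table, and malformed references are returned unchanged rather than reported via `_createError`:

```ts
const NAMED: Record<string, string> = {amp: '&', lt: '<', gt: '>', quot: '"'};

// Decode a single complete entity such as '&amp;', '&#65;' or '&#x41;'.
function decodeEntity(entity: string): string {
  const match = /^&(#(x|X)?)?([^;]+);$/.exec(entity);
  if (match === null) return '&';         // no trailing ';': treat '&' as plain text
  if (match[1] === undefined) {           // named form, e.g. '&amp;'
    return NAMED[match[3]] ?? entity;
  }
  const isHex = match[2] !== undefined;   // '&#x41;'/'&#X41;' vs '&#65;'
  return String.fromCharCode(parseInt(match[3], isHex ? 16 : 10));
}

decodeEntity('&amp;');   // '&'
decodeEntity('&#65;');   // 'A'
decodeEntity('&#x41;');  // 'A'
```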

packages/compiler/src/ml_parser/parser.ts

@@ -226,21 +226,20 @@ class _TreeBuilder {
       }
     }
 
-    // For now recombine text, interpolation and entity tokens
-    while (this._peek.type === lex.TokenType.INTERPOLATION ||
-           this._peek.type === lex.TokenType.TEXT ||
-           this._peek.type === lex.TokenType.ENCODED_ENTITY) {
-      token = this._advance();
-      if (token.type === lex.TokenType.INTERPOLATION) {
-        // For backward compatibility we decode HTML entities that appear in interpolation
-        // expressions. This is arguably a bug, but it could be a considerable breaking change to
-        // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
-        // chain after View Engine has been removed.
-        text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-      } else if (token.type === lex.TokenType.ENCODED_ENTITY) {
-        text += token.parts[0];
-      } else {
-        text += token.parts.join('');
+    // For now recombine text and interpolation tokens
+    if (this._peek.type === lex.TokenType.INTERPOLATION) {
+      while (this._peek.type === lex.TokenType.INTERPOLATION ||
+             this._peek.type === lex.TokenType.TEXT) {
+        token = this._advance();
+        if (token.type === lex.TokenType.INTERPOLATION) {
+          // For backward compatibility we decode HTML entities that appear in interpolation
+          // expressions. This is arguably a bug, but it could be a considerable breaking change to
+          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+          // chain after View Engine has been removed.
+          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else {
+          text += token.parts.join('');
+        }
       }
     }
@@ -370,17 +369,16 @@ class _TreeBuilder {
       this._advance();
     }
 
-    // Consume the attribute value
+    // Consume the value
     let value = '';
     let valueStartSpan: ParseSourceSpan|undefined = undefined;
     let valueEnd: ParseLocation|undefined = undefined;
     if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
       valueStartSpan = this._peek.sourceSpan;
       valueEnd = this._peek.sourceSpan.end;
-      // For now recombine text, interpolation and entity tokens
+      // For now we are recombining text and interpolation tokens
       while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
-             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
-             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
+             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
         let valueToken = this._advance();
         if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
@@ -388,8 +386,6 @@ class _TreeBuilder {
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
-          value += valueToken.parts[0];
         } else {
           value += valueToken.parts.join('');
         }
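Note the backward-compatibility path that survives in both _TreeBuilder hunks above: entities inside interpolation tokens are still decoded textually via `replace(/&([^;]+);/g, decodeEntity)`. A sketch of what that pass does (`NAMED` again stands in for `NAMED_ENTITIES`; the parser's real `decodeEntity` helper also handles `&#...;` character references):

```ts
const NAMED: Record<string, string> = {amp: '&', lt: '<', gt: '>'};

// Same callback shape as String.prototype.replace() expects:
// `entity` is the capture group between '&' and ';'.
function decodeEntity(match: string, entity: string): string {
  return NAMED[entity] ?? match;
}

const decoded = '{{ a &amp;&amp; b }}'.replace(/&([^;]+);/g, decodeEntity);
// decoded === '{{ a && b }}'
```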

packages/compiler/test/ml_parser/lexer_spec.ts

@@ -407,11 +407,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TAG_OPEN_START, '', 't'],
         [lex.TokenType.ATTR_NAME, '', 'a'],
         [lex.TokenType.ATTR_QUOTE, '"'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
-        [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-        [lex.TokenType.ATTR_VALUE_TEXT, ''],
+        [lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
         [lex.TokenType.ATTR_QUOTE, '"'],
         [lex.TokenType.TAG_OPEN_END],
         [lex.TokenType.EOF],
@@ -526,60 +522,50 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     describe('entities', () => {
       it('should parse named entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
           [lex.TokenType.EOF],
         ]);
       });
 
       it('should parse hexadecimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'AA'],
           [lex.TokenType.EOF],
         ]);
       });
 
       it('should parse decimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
-          [lex.TokenType.TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
-          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.TEXT, 'A'],
          [lex.TokenType.EOF],
        ]);
      });
 
      it('should store the locations', () => {
        expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
          [lex.TokenType.EOF, ''],
        ]);
      });
 
      it('should report malformed/unknown entities', () => {
        expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unknown entity "tbo" - use the "&#<decimal>;" or "&#x<hex>;" syntax', '0:0'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#3sdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unable to parse entity "&#3s" - decimal character reference entities must end with ";"',
          '0:4'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
-          lex.TokenType.ENCODED_ENTITY,
+          lex.TokenType.TEXT,
          'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
          '0:5'
        ]]);
        expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
-          [lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
+          [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
        ]);
      });
    });
@@ -657,16 +643,12 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
      it('should parse entities', () => {
        expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&b'],
          [lex.TokenType.EOF],
        ]);
        expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a'],
-          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
-          [lex.TokenType.TEXT, 'b'],
+          [lex.TokenType.TEXT, 'a&amp;b'],
          [lex.TokenType.EOF, ''],
        ]);
      });
@@ -912,9 +894,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
        expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
          [lex.TokenType.TAG_OPEN_START, '', 'title'],
          [lex.TokenType.TAG_OPEN_END],
-          [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
-          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
-          [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
+          [lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
          [lex.TokenType.TAG_CLOSE, '', 'title'],
          [lex.TokenType.EOF],
        ]);
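For reference when reading these expectations: the spec's humanize helpers flatten each token into a `[type, ...parts]` array. Roughly, as a sketch (the actual helpers earlier in lexer_spec.ts also thread tokenizer options through and fail on lexer errors):

```ts
function tokenizeAndHumanizeParts(input: string): any[] {
  const {tokens, errors} = lex.tokenize(input, 'someUrl', getHtmlTagDefinition);
  expect(errors).toEqual([]);
  return tokens.map(token => [token.type, ...token.parts]);
}
```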