refactor(compiler): support encoded entity tokens when lexing markup (#42062)

The lexer now splits encoded entity tokens out from text and attribute value tokens.

Previously, encoded entities were decoded and the decoded value was folded into
the text token of the surrounding text. Now each entity gets a token of its
own. There are two scenarios: text and attribute values.

Previously the contents of `<div>Hello &amp; goodbye</div>` were lexed as a single
TEXT token. Now they are three tokens:

```
TEXT: "Hello "
ENCODED_ENTITY: "&", "&amp;"
TEXT: " goodbye"
```

Previously the attribute value in `<div title="Hello &amp; goodbye">` was lexed as
a single ATTR_VALUE_TEXT token. Now it is three tokens:

```
ATTR_VALUE_TEXT: "Hello "
ENCODED_ENTITY: "&", "&amp;"
ATTR_VALUE_TEXT: " goodbye"
```

- ENCODED_ENTITY tokens have two parts: the "decoded" string and the original
  "encoded" string (see the sketch below).
- ENCODED_ENTITY tokens are always preceded and followed by either TEXT tokens
  or ATTR_VALUE_TEXT tokens, depending upon the context, even if those
  surrounding tokens represent an empty string.
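
For illustration, a minimal sketch of the resulting token shapes for the text
example above. The `TokenType` enum and `Token` interface here are simplified,
hand-written stand-ins for the compiler's real types, not captured output:

```
// Simplified stand-ins for the compiler's TokenType/Token (illustration only).
enum TokenType {
  TEXT,
  ENCODED_ENTITY,
}

interface Token {
  type: TokenType;
  parts: string[];
}

// Token stream for the text `Hello &amp; goodbye`:
const tokens: Token[] = [
  {type: TokenType.TEXT, parts: ['Hello ']},
  {type: TokenType.ENCODED_ENTITY, parts: ['&', '&amp;']},  // [decoded, encoded]
  {type: TokenType.TEXT, parts: [' goodbye']},
];
```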

The HTML parser has been modified to recombine these tokens, so this
refactoring has only a limited effect in this commit; the recombination amounts
to the sketch below. Further refactorings that make use of the new tokens will
follow in subsequent commits.
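
As a rough sketch (reusing the simplified `Token` shape from above, and
ignoring the interpolation tokens that the real `_TreeBuilder` also handles),
the recombination takes the decoded value from `parts[0]` of each
ENCODED_ENTITY token and joins the parts of the surrounding text tokens:

```
// Recombine a run of TEXT / ENCODED_ENTITY tokens back into a decoded string.
function recombine(tokens: Token[]): string {
  let text = '';
  for (const token of tokens) {
    if (token.type === TokenType.ENCODED_ENTITY) {
      text += token.parts[0];        // decoded value, e.g. '&' for '&amp;'
    } else {
      text += token.parts.join('');  // plain text parts
    }
  }
  return text;
}

// recombine(tokens) === 'Hello & goodbye'
```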

PR Close #42062
Authored by Pete Bacon Darwin on 2021-05-14 18:53:17 +01:00; committed by atscott
parent c516e252fc
commit 942b24d5ea
3 changed files with 98 additions and 58 deletions


```
@@ -23,6 +23,7 @@ export enum TokenType {
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
   INTERPOLATION,
+  ENCODED_ENTITY,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -395,19 +396,16 @@ class _Tokenizer {
     }
   }

-  private _readChar(decodeEntities: boolean): string {
-    if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
-      return this._decodeEntity();
-    } else {
-      // Don't rely upon reading directly from `_input` as the actual char value
-      // may have been generated from an escape sequence.
-      const char = String.fromCodePoint(this._cursor.peek());
-      this._cursor.advance();
-      return char;
-    }
+  private _readChar(): string {
+    // Don't rely upon reading directly from `_input` as the actual char value
+    // may have been generated from an escape sequence.
+    const char = String.fromCodePoint(this._cursor.peek());
+    this._cursor.advance();
+    return char;
   }

-  private _decodeEntity(): string {
+  private _consumeEntity(textTokenType: TokenType): void {
+    this._beginToken(TokenType.ENCODED_ENTITY);
     const start = this._cursor.clone();
     this._cursor.advance();
     if (this._attemptCharCode(chars.$HASH)) {
@@ -427,7 +425,7 @@
       this._cursor.advance();
       try {
         const charCode = parseInt(strNum, isHex ? 16 : 10);
-        return String.fromCharCode(charCode);
+        this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
       } catch {
         throw this._createError(
             _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -436,21 +434,25 @@
       const nameStart = this._cursor.clone();
       this._attemptCharCodeUntilFn(isNamedEntityEnd);
       if (this._cursor.peek() != chars.$SEMICOLON) {
+        // No semicolon was found so abort the encoded entity token that was in progress, and treat
+        // this as a text token
+        this._beginToken(textTokenType, start);
         this._cursor = nameStart;
-        return '&';
+        this._endToken(['&']);
+      } else {
+        const name = this._cursor.getChars(nameStart);
+        this._cursor.advance();
+        const char = NAMED_ENTITIES[name];
+        if (!char) {
+          throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
+        }
+        this._endToken([char, `&${name};`]);
       }
-      const name = this._cursor.getChars(nameStart);
-      this._cursor.advance();
-      const char = NAMED_ENTITIES[name];
-      if (!char) {
-        throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
-      }
-      return char;
     }
   }

-  private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
-    this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
+  private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
+    this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
     const parts: string[] = [];
     while (true) {
       const tagCloseStart = this._cursor.clone();
@@ -459,9 +461,16 @@
       if (foundEndMarker) {
         break;
       }
-      parts.push(this._readChar(decodeEntities));
+      if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
+        this._endToken([this._processCarriageReturns(parts.join(''))]);
+        parts.length = 0;
+        this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
+        this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
+      } else {
+        parts.push(this._readChar());
+      }
     }
-    return this._endToken([this._processCarriageReturns(parts.join(''))]);
+    this._endToken([this._processCarriageReturns(parts.join(''))]);
   }

   private _consumeComment(start: CharacterCursor) {
@@ -563,8 +572,8 @@
     }
   }

-  private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
-    this._consumeRawText(decodeEntities, () => {
+  private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
+    this._consumeRawText(consumeEntities, () => {
       if (!this._attemptCharCode(chars.$LT)) return false;
       if (!this._attemptCharCode(chars.$SLASH)) return false;
       this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -712,11 +721,16 @@
       const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
         this._endToken([this._processCarriageReturns(parts.join(''))], current);
-        this._consumeInterpolation(interpolationTokenType, current);
         parts.length = 0;
+        this._consumeInterpolation(interpolationTokenType, current);
+        this._beginToken(textTokenType);
+      } else if (this._cursor.peek() === chars.$AMPERSAND) {
+        this._endToken([this._processCarriageReturns(parts.join(''))]);
+        parts.length = 0;
+        this._consumeEntity(textTokenType);
         this._beginToken(textTokenType);
       } else {
-        parts.push(this._readChar(true));
+        parts.push(this._readChar());
       }
     }
@@ -895,7 +909,9 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
   let lastDstToken: Token|undefined = undefined;
   for (let i = 0; i < srcTokens.length; i++) {
     const token = srcTokens[i];
-    if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
+    if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
+        (lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
+         token.type == TokenType.ATTR_VALUE_TEXT)) {
       lastDstToken.parts[0]! += token.parts[0];
       lastDstToken.sourceSpan.end = token.sourceSpan.end;
     } else {
```


```
@@ -226,20 +226,21 @@ class _TreeBuilder {
         }
       }
-      // For now recombine text and interpolation tokens
-      if (this._peek.type === lex.TokenType.INTERPOLATION) {
-        while (this._peek.type === lex.TokenType.INTERPOLATION ||
-               this._peek.type === lex.TokenType.TEXT) {
+      // For now recombine text, interpolation and entity tokens
+      while (this._peek.type === lex.TokenType.INTERPOLATION ||
+             this._peek.type === lex.TokenType.TEXT ||
+             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
         token = this._advance();
         if (token.type === lex.TokenType.INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
           // expressions. This is arguably a bug, but it could be a considerable breaking change to
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
-        } else {
-          text += token.parts.join('');
-        }
+        } else if (token.type === lex.TokenType.ENCODED_ENTITY) {
+          text += token.parts[0];
+        } else {
+          text += token.parts.join('');
+        }
       }
-      }
     }
@@ -369,16 +370,17 @@
       this._advance();
     }

-    // Consume the value
+    // Consume the attribute value
     let value = '';
     let valueStartSpan: ParseSourceSpan|undefined = undefined;
     let valueEnd: ParseLocation|undefined = undefined;
     if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
       valueStartSpan = this._peek.sourceSpan;
       valueEnd = this._peek.sourceSpan.end;
-      // For now we are recombining text and interpolation tokens
+      // For now recombine text, interpolation and entity tokens
       while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
-             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
+             this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
+             this._peek.type === lex.TokenType.ENCODED_ENTITY) {
         let valueToken = this._advance();
         if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
           // For backward compatibility we decode HTML entities that appear in interpolation
@@ -386,6 +388,8 @@
           // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
           // chain after View Engine has been removed.
           value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
+          value += valueToken.parts[0];
         } else {
           value += valueToken.parts.join('');
         }
```


```
@@ -407,7 +407,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TAG_OPEN_START, '', 't'],
         [lex.TokenType.ATTR_NAME, '', 'a'],
         [lex.TokenType.ATTR_QUOTE, '"'],
-        [lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
+        [lex.TokenType.ATTR_VALUE_TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
+        [lex.TokenType.ATTR_VALUE_TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
+        [lex.TokenType.ATTR_VALUE_TEXT, ''],
         [lex.TokenType.ATTR_QUOTE, '"'],
         [lex.TokenType.TAG_OPEN_END],
         [lex.TokenType.EOF],
@@ -522,50 +526,60 @@
     describe('entities', () => {
       it('should parse named entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a&b'],
+          [lex.TokenType.TEXT, 'a'],
+          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+          [lex.TokenType.TEXT, 'b'],
           [lex.TokenType.EOF],
         ]);
       });

       it('should parse hexadecimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
-          [lex.TokenType.TEXT, 'AA'],
+          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
+          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
+          [lex.TokenType.TEXT, ''],
           [lex.TokenType.EOF],
         ]);
       });

       it('should parse decimal entities', () => {
         expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
-          [lex.TokenType.TEXT, 'A'],
+          [lex.TokenType.TEXT, ''],
+          [lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
+          [lex.TokenType.TEXT, ''],
          [lex.TokenType.EOF],
         ]);
       });

       it('should store the locations', () => {
         expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a&amp;b'],
+          [lex.TokenType.TEXT, 'a'],
+          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
+          [lex.TokenType.TEXT, 'b'],
           [lex.TokenType.EOF, ''],
         ]);
       });

       it('should report malformed/unknown entities', () => {
         expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
-          lex.TokenType.TEXT,
+          lex.TokenType.ENCODED_ENTITY,
           'Unknown entity "tbo" - use the "&#<decimal>;" or "&#x<hex>;" syntax', '0:0'
         ]]);
         expect(tokenizeAndHumanizeErrors('&#3sdf;')).toEqual([[
-          lex.TokenType.TEXT,
+          lex.TokenType.ENCODED_ENTITY,
           'Unable to parse entity "&#3s" - decimal character reference entities must end with ";"',
           '0:4'
         ]]);
         expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
-          lex.TokenType.TEXT,
+          lex.TokenType.ENCODED_ENTITY,
           'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
           '0:5'
         ]]);
         expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
-          [lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
+          [lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
         ]);
       });
     });
@@ -643,12 +657,16 @@
       it('should parse entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a&b'],
+          [lex.TokenType.TEXT, 'a'],
+          [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+          [lex.TokenType.TEXT, 'b'],
           [lex.TokenType.EOF],
         ]);
         expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
-          [lex.TokenType.TEXT, 'a&amp;b'],
+          [lex.TokenType.TEXT, 'a'],
+          [lex.TokenType.ENCODED_ENTITY, '&amp;'],
+          [lex.TokenType.TEXT, 'b'],
           [lex.TokenType.EOF, ''],
         ]);
       });
@@ -894,7 +912,9 @@
       expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
         [lex.TokenType.TAG_OPEN_START, '', 'title'],
         [lex.TokenType.TAG_OPEN_END],
-        [lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
+        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
+        [lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
+        [lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
         [lex.TokenType.TAG_CLOSE, '', 'title'],
         [lex.TokenType.EOF],
       ]);
```