refactor(compiler): support encoded entity tokens when lexing markup (#42062)

The lexer now splits encoded entity tokens out from text and attribute value tokens.

Previously, encoded entities were decoded and the decoded value was included as
part of the surrounding text token. Now the entities have their own tokens.
There are two scenarios: text content and attribute values.

Previously the contents of `<div>Hello &amp; goodbye</div>` were tokenized as a
single TEXT token. Now they become three tokens:

```
TEXT: "Hello "
ENCODED_ENTITY: "&", "&amp;"
TEXT: " goodbye"
```

Previously the attribute value in `<div title="Hello &amp; goodbye">` was
tokenized as a single ATTR_VALUE_TEXT token. Now it becomes three tokens:

```
ATTR_VALUE_TEXT: "Hello "
ENCODED_ENTITY: "&", "&amp;"
ATTR_VALUE_TEXT: " goodbye"
```

- ENCODED_ENTITY tokens have two parts: "decoded" and "encoded".
- ENCODED_ENTITY tokens are always preceded and followed by either TEXT tokens
  or ATTR_VALUE_TEXT tokens, depending upon the context, even if they represent
  an empty string.

The HTML parser has been modified to recombine these tokens, so that this
refactoring has a limited effect in this commit. Further refactorings that use
the new tokens will follow in subsequent commits.
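
The sketch below (not code from this commit) illustrates that recombination
step. It assumes a simplified `Token` shape reduced to the two fields this
change relies on (`type` and `parts`); the real lexer tokens also carry source
spans, and the tree builder additionally handles interpolation tokens.

```
// Simplified sketch of the recombination performed by the tree builder.
enum TokenType { TEXT, ATTR_VALUE_TEXT, ENCODED_ENTITY }

interface Token {
  type: TokenType;
  // TEXT / ATTR_VALUE_TEXT: [text]
  // ENCODED_ENTITY: [decoded, encoded], e.g. ['&', '&amp;']
  parts: string[];
}

function recombine(tokens: Token[]): string {
  let text = '';
  for (const token of tokens) {
    if (token.type === TokenType.ENCODED_ENTITY) {
      text += token.parts[0];  // use the decoded value
    } else {
      text += token.parts.join('');
    }
  }
  return text;
}

// The three tokens from the `<div>Hello &amp; goodbye</div>` example above:
recombine([
  {type: TokenType.TEXT, parts: ['Hello ']},
  {type: TokenType.ENCODED_ENTITY, parts: ['&', '&amp;']},
  {type: TokenType.TEXT, parts: [' goodbye']},
]);  // => 'Hello & goodbye'
```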

PR Close #42062
Pete Bacon Darwin 2021-05-14 18:53:17 +01:00 committed by atscott
parent c516e252fc
commit 942b24d5ea
3 changed files with 98 additions and 58 deletions


@@ -23,6 +23,7 @@ export enum TokenType {
ESCAPABLE_RAW_TEXT,
RAW_TEXT,
INTERPOLATION,
ENCODED_ENTITY,
COMMENT_START,
COMMENT_END,
CDATA_START,
@@ -395,19 +396,16 @@ class _Tokenizer {
}
}
private _readChar(decodeEntities: boolean): string {
if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) {
return this._decodeEntity();
} else {
// Don't rely upon reading directly from `_input` as the actual char value
// may have been generated from an escape sequence.
const char = String.fromCodePoint(this._cursor.peek());
this._cursor.advance();
return char;
}
private _readChar(): string {
// Don't rely upon reading directly from `_input` as the actual char value
// may have been generated from an escape sequence.
const char = String.fromCodePoint(this._cursor.peek());
this._cursor.advance();
return char;
}
private _decodeEntity(): string {
private _consumeEntity(textTokenType: TokenType): void {
this._beginToken(TokenType.ENCODED_ENTITY);
const start = this._cursor.clone();
this._cursor.advance();
if (this._attemptCharCode(chars.$HASH)) {
@@ -427,7 +425,7 @@ class _Tokenizer {
this._cursor.advance();
try {
const charCode = parseInt(strNum, isHex ? 16 : 10);
return String.fromCharCode(charCode);
this._endToken([String.fromCharCode(charCode), this._cursor.getChars(start)]);
} catch {
throw this._createError(
_unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan());
@@ -436,21 +434,25 @@ class _Tokenizer {
const nameStart = this._cursor.clone();
this._attemptCharCodeUntilFn(isNamedEntityEnd);
if (this._cursor.peek() != chars.$SEMICOLON) {
// No semicolon was found so abort the encoded entity token that was in progress, and treat
// this as a text token
this._beginToken(textTokenType, start);
this._cursor = nameStart;
return '&';
this._endToken(['&']);
} else {
const name = this._cursor.getChars(nameStart);
this._cursor.advance();
const char = NAMED_ENTITIES[name];
if (!char) {
throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
}
this._endToken([char, `&${name};`]);
}
const name = this._cursor.getChars(nameStart);
this._cursor.advance();
const char = NAMED_ENTITIES[name];
if (!char) {
throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start));
}
return char;
}
}
private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token {
this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
private _consumeRawText(consumeEntities: boolean, endMarkerPredicate: () => boolean): void {
this._beginToken(consumeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT);
const parts: string[] = [];
while (true) {
const tagCloseStart = this._cursor.clone();
@@ -459,9 +461,16 @@ class _Tokenizer {
if (foundEndMarker) {
break;
}
parts.push(this._readChar(decodeEntities));
if (consumeEntities && this._cursor.peek() === chars.$AMPERSAND) {
this._endToken([this._processCarriageReturns(parts.join(''))]);
parts.length = 0;
this._consumeEntity(TokenType.ESCAPABLE_RAW_TEXT);
this._beginToken(TokenType.ESCAPABLE_RAW_TEXT);
} else {
parts.push(this._readChar());
}
}
return this._endToken([this._processCarriageReturns(parts.join(''))]);
this._endToken([this._processCarriageReturns(parts.join(''))]);
}
private _consumeComment(start: CharacterCursor) {
@@ -563,8 +572,8 @@ class _Tokenizer {
}
}
private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) {
this._consumeRawText(decodeEntities, () => {
private _consumeRawTextWithTagClose(prefix: string, tagName: string, consumeEntities: boolean) {
this._consumeRawText(consumeEntities, () => {
if (!this._attemptCharCode(chars.$LT)) return false;
if (!this._attemptCharCode(chars.$SLASH)) return false;
this._attemptCharCodeUntilFn(isNotWhitespace);
@@ -712,11 +721,16 @@ class _Tokenizer {
const current = this._cursor.clone();
if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
this._endToken([this._processCarriageReturns(parts.join(''))], current);
this._consumeInterpolation(interpolationTokenType, current);
parts.length = 0;
this._consumeInterpolation(interpolationTokenType, current);
this._beginToken(textTokenType);
} else if (this._cursor.peek() === chars.$AMPERSAND) {
this._endToken([this._processCarriageReturns(parts.join(''))]);
parts.length = 0;
this._consumeEntity(textTokenType);
this._beginToken(textTokenType);
} else {
parts.push(this._readChar(true));
parts.push(this._readChar());
}
}
@@ -895,7 +909,9 @@ function mergeTextTokens(srcTokens: Token[]): Token[] {
let lastDstToken: Token|undefined = undefined;
for (let i = 0; i < srcTokens.length; i++) {
const token = srcTokens[i];
if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) {
if ((lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) ||
(lastDstToken && lastDstToken.type == TokenType.ATTR_VALUE_TEXT &&
token.type == TokenType.ATTR_VALUE_TEXT)) {
lastDstToken.parts[0]! += token.parts[0];
lastDstToken.sourceSpan.end = token.sourceSpan.end;
} else {


@@ -226,20 +226,21 @@ class _TreeBuilder {
}
}
// For now recombine text and interpolation tokens
if (this._peek.type === lex.TokenType.INTERPOLATION) {
while (this._peek.type === lex.TokenType.INTERPOLATION ||
this._peek.type === lex.TokenType.TEXT) {
token = this._advance();
if (token.type === lex.TokenType.INTERPOLATION) {
// For backward compatibility we decode HTML entities that appear in interpolation
// expressions. This is arguably a bug, but it could be a considerable breaking change to
// fix it. It should be addressed in a larger project to refactor the entire parser/lexer
// chain after View Engine has been removed.
text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
} else {
text += token.parts.join('');
}
// For now recombine text, interpolation and entity tokens
while (this._peek.type === lex.TokenType.INTERPOLATION ||
this._peek.type === lex.TokenType.TEXT ||
this._peek.type === lex.TokenType.ENCODED_ENTITY) {
token = this._advance();
if (token.type === lex.TokenType.INTERPOLATION) {
// For backward compatibility we decode HTML entities that appear in interpolation
// expressions. This is arguably a bug, but it could be a considerable breaking change to
// fix it. It should be addressed in a larger project to refactor the entire parser/lexer
// chain after View Engine has been removed.
text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
} else if (token.type === lex.TokenType.ENCODED_ENTITY) {
text += token.parts[0];
} else {
text += token.parts.join('');
}
}
@@ -369,16 +370,17 @@ class _TreeBuilder {
this._advance();
}
// Consume the value
// Consume the attribute value
let value = '';
let valueStartSpan: ParseSourceSpan|undefined = undefined;
let valueEnd: ParseLocation|undefined = undefined;
if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
valueStartSpan = this._peek.sourceSpan;
valueEnd = this._peek.sourceSpan.end;
// For now we are recombining text and interpolation tokens
// For now recombine text, interpolation and entity tokens
while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION ||
this._peek.type === lex.TokenType.ENCODED_ENTITY) {
let valueToken = this._advance();
if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
// For backward compatibility we decode HTML entities that appear in interpolation
@@ -386,6 +388,8 @@ class _TreeBuilder {
// fix it. It should be addressed in a larger project to refactor the entire parser/lexer
// chain after View Engine has been removed.
value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
} else if (valueToken.type === lex.TokenType.ENCODED_ENTITY) {
value += valueToken.parts[0];
} else {
value += valueToken.parts.join('');
}


@@ -407,7 +407,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -522,50 +526,60 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
describe('entities', () => {
it('should parse named entities', () => {
expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
[lex.TokenType.TEXT, 'a&b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
[lex.TokenType.TEXT, 'b'],
[lex.TokenType.EOF],
]);
});
it('should parse hexadecimal entities', () => {
expect(tokenizeAndHumanizeParts('&#x41;&#X41;')).toEqual([
[lex.TokenType.TEXT, 'AA'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#x41;'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#X41;'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.EOF],
]);
});
it('should parse decimal entities', () => {
expect(tokenizeAndHumanizeParts('&#65;')).toEqual([
[lex.TokenType.TEXT, 'A'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, 'A', '&#65;'],
[lex.TokenType.TEXT, ''],
[lex.TokenType.EOF],
]);
});
it('should store the locations', () => {
expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
[lex.TokenType.TEXT, 'a&amp;b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.ENCODED_ENTITY, '&amp;'],
[lex.TokenType.TEXT, 'b'],
[lex.TokenType.EOF, ''],
]);
});
it('should report malformed/unknown entities', () => {
expect(tokenizeAndHumanizeErrors('&tbo;')).toEqual([[
lex.TokenType.TEXT,
lex.TokenType.ENCODED_ENTITY,
'Unknown entity "tbo" - use the "&#<decimal>;" or "&#x<hex>;" syntax', '0:0'
]]);
expect(tokenizeAndHumanizeErrors('&#3sdf;')).toEqual([[
lex.TokenType.TEXT,
lex.TokenType.ENCODED_ENTITY,
'Unable to parse entity "&#3s" - decimal character reference entities must end with ";"',
'0:4'
]]);
expect(tokenizeAndHumanizeErrors('&#xasdf;')).toEqual([[
lex.TokenType.TEXT,
lex.TokenType.ENCODED_ENTITY,
'Unable to parse entity "&#xas" - hexadecimal character reference entities must end with ";"',
'0:5'
]]);
expect(tokenizeAndHumanizeErrors('&#xABC')).toEqual([
[lex.TokenType.TEXT, 'Unexpected character "EOF"', '0:6']
[lex.TokenType.ENCODED_ENTITY, 'Unexpected character "EOF"', '0:6']
]);
});
});
@@ -643,12 +657,16 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
it('should parse entities', () => {
expect(tokenizeAndHumanizeParts('a&amp;b')).toEqual([
[lex.TokenType.TEXT, 'a&b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
[lex.TokenType.TEXT, 'b'],
[lex.TokenType.EOF],
]);
expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
[lex.TokenType.TEXT, 'a&amp;b'],
[lex.TokenType.TEXT, 'a'],
[lex.TokenType.ENCODED_ENTITY, '&amp;'],
[lex.TokenType.TEXT, 'b'],
[lex.TokenType.EOF, ''],
]);
});
@@ -894,7 +912,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
expect(tokenizeAndHumanizeParts(`<title>&amp;</title>`)).toEqual([
[lex.TokenType.TAG_OPEN_START, '', 'title'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.ESCAPABLE_RAW_TEXT, '&'],
[lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
[lex.TokenType.ENCODED_ENTITY, '&', '&amp;'],
[lex.TokenType.ESCAPABLE_RAW_TEXT, ''],
[lex.TokenType.TAG_CLOSE, '', 'title'],
[lex.TokenType.EOF],
]);