refactor(compiler): support interpolation tokens when lexing markup (#42062)

The lexer now splits interpolation tokens out of text tokens.

Previously the contents of `<div>Hello, {{ name}}</div>` would be a single
text token. Now they will be three tokens:

```
TEXT: "Hello, "
INTERPOLATION: "{{", " name", "}}"
TEXT: ""
```

- INTERPOLATION tokens have three parts: "start marker", "expression"
  and "end marker".
- INTERPOLATION tokens are always preceded and followed by TEXT tokens,
  even if those TEXT tokens represent an empty string.
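
For illustration, here is a minimal sketch of the resulting token stream,
assuming a simplified `Token` shape of just a `type` and its `parts` (the
real lexer's `Token` also carries a `ParseSourceSpan`):

```
// Simplified Token shape, for illustration only; the real Token in
// lexer.ts also records a source span for each token.
interface Token {
  type: 'TEXT'|'INTERPOLATION';
  parts: string[];
}

// The token stream for the text content `Hello, {{ name}}`:
const tokens: Token[] = [
  {type: 'TEXT', parts: ['Hello, ']},
  // INTERPOLATION parts: start marker, expression, end marker.
  {type: 'INTERPOLATION', parts: ['{{', ' name', '}}']},
  // A TEXT token always follows an interpolation, even when empty.
  {type: 'TEXT', parts: ['']},
];
```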

The HTML parser has been modified to recombine these tokens so that this
refactoring has only a limited effect in this commit. Further refactorings
that use these new tokens will follow in subsequent commits.
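
As a rough sketch of that recombination (simplified; the real logic in
`_consumeText()` in the parser diff below also decodes HTML entities inside
interpolations and merges the tokens' source spans), using the illustrative
`Token` shape from above:

```
// Concatenate a TEXT token's text with the parts of all directly
// following INTERPOLATION/TEXT tokens, reproducing the single text
// string that the parser consumed before this change.
function recombineText(tokens: Token[], start: number): string {
  let text = tokens[start].parts[0];
  let i = start + 1;
  while (i < tokens.length &&
         (tokens[i].type === 'INTERPOLATION' || tokens[i].type === 'TEXT')) {
    text += tokens[i].parts.join('');
    i++;
  }
  return text;
}
```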

PR Close #42062
Pete Bacon Darwin, 2021-05-11 17:03:38 +01:00 (committed by atscott)
parent 75855196e3
commit c8a46bfdcd
4 changed files with 233 additions and 31 deletions


@@ -22,6 +22,7 @@ export enum TokenType {
   TEXT,
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
+  INTERPOLATION,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -285,7 +286,7 @@ class _Tokenizer {
     }
     const token = new Token(
         this._currentTokenType, parts,
-        this._cursor.getSpan(this._currentTokenStart, this._leadingTriviaCodePoints));
+        (end ?? this._cursor).getSpan(this._currentTokenStart, this._leadingTriviaCodePoints));
     this.tokens.push(token);
     this._currentTokenStart = null;
     this._currentTokenType = null;
@@ -696,19 +697,16 @@ class _Tokenizer {
   }
   private _consumeText() {
     const start = this._cursor.clone();
-    this._beginToken(TokenType.TEXT, start);
+    this._beginToken(TokenType.TEXT);
     const parts: string[] = [];
     do {
       const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
-        parts.push(this._interpolationConfig.start);
-        this._inInterpolation = true;
-      } else if (
-          this._interpolationConfig && this._inInterpolation &&
-          this._attemptStr(this._interpolationConfig.end)) {
-        parts.push(this._interpolationConfig.end);
-        this._inInterpolation = false;
+        this._endToken([this._processCarriageReturns(parts.join(''))], current);
+        this._consumeInterpolation(current);
+        parts.length = 0;
+        this._beginToken(TokenType.TEXT);
       } else {
         parts.push(this._readChar(true));
       }
@@ -721,6 +719,61 @@ class _Tokenizer {
     this._endToken([this._processCarriageReturns(parts.join(''))]);
   }
+
+  private _consumeInterpolation(interpolationStart: CharacterCursor) {
+    const parts: string[] = [];
+    this._beginToken(TokenType.INTERPOLATION, interpolationStart);
+    parts.push(this._interpolationConfig.start);
+
+    // Find the end of the interpolation, ignoring content inside quotes.
+    const expressionStart = this._cursor.clone();
+    let inQuote: string|null = null;
+    let inComment = false;
+    while (this._cursor.peek() !== chars.$EOF) {
+      const current = this._cursor.clone();
+
+      if (this._isTagStart()) {
+        // We are starting what looks like an HTML element in the middle of this interpolation.
+        // Reset the cursor to before the `<` character and end the interpolation token.
+        // (This is actually wrong but here for backward compatibility).
+        this._cursor = current;
+        parts.push(this._getProcessedChars(expressionStart, current));
+        return this._endToken(parts);
+      }
+
+      if (inQuote === null) {
+        if (this._attemptStr(this._interpolationConfig.end)) {
+          // We are not in a string, and we hit the end interpolation marker
+          parts.push(this._getProcessedChars(expressionStart, current));
+          parts.push(this._interpolationConfig.end);
+          return this._endToken(parts);
+        } else if (this._attemptStr('//')) {
+          // Once we are in a comment we ignore any quotes
+          inComment = true;
+        }
+      }
+
+      const char = this._readChar(true);
+      if (char === '\\') {
+        // Skip the next character because it was escaped.
+        this._readChar(true);
+      } else if (char === inQuote) {
+        // Exiting the current quoted string
+        inQuote = null;
+      } else if (!inComment && /['"`]/.test(char)) {
+        // Entering a new quoted string
+        inQuote = char;
+      }
+    }
+
+    // We hit EOF without finding a closing interpolation marker
+    parts.push(this._getProcessedChars(expressionStart, this._cursor));
+    return this._endToken(parts);
+  }
+
+  private _getProcessedChars(start: CharacterCursor, end: CharacterCursor): string {
+    return this._processCarriageReturns(end.getChars(start));
+  }
+
   private _isTextEnd(): boolean {
     if (this._isTagStart() || this._cursor.peek() === chars.$EOF) {
       return true;


@@ -9,6 +9,7 @@
 import {ParseError, ParseSourceSpan} from '../parse_util';
 import * as html from './ast';
+import {NAMED_ENTITIES} from './entities';
 import * as lex from './lexer';
 import {getNsPrefix, mergeNsAndName, splitNsName, TagDefinition} from './tags';
@@ -215,6 +216,7 @@ class _TreeBuilder {
   }
   private _consumeText(token: lex.Token) {
+    const startSpan = token.sourceSpan;
     let text = token.parts[0];
     if (text.length > 0 && text[0] == '\n') {
       const parent = this._getParentElement();
@@ -224,8 +226,29 @@ class _TreeBuilder {
       }
     }
+
+    // For now recombine text and interpolation tokens
+    if (this._peek.type === lex.TokenType.INTERPOLATION) {
+      while (this._peek.type === lex.TokenType.INTERPOLATION ||
+             this._peek.type === lex.TokenType.TEXT) {
+        token = this._advance();
+        if (token.type === lex.TokenType.INTERPOLATION) {
+          // For backward compatibility we decode HTML entities that appear in interpolation
+          // expressions. This is arguably a bug, but it could be a considerable breaking change to
+          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+          // chain after View Engine has been removed.
+          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else {
+          text += token.parts.join('');
+        }
+      }
+    }
+
     if (text.length > 0) {
-      this._addToParent(new html.Text(text, token.sourceSpan));
+      const endSpan = token.sourceSpan;
+      this._addToParent(new html.Text(
+          text,
+          new ParseSourceSpan(
+              startSpan.start, endSpan.end, startSpan.fullStart, startSpan.details)));
     }
   }
@@ -395,3 +418,21 @@ class _TreeBuilder {
 function lastOnStack(stack: any[], element: any): boolean {
   return stack.length > 0 && stack[stack.length - 1] === element;
 }
+
+/**
+ * Decode the `entity` string, which we believe is the contents of an HTML entity.
+ *
+ * If the string is not actually a valid/known entity then just return the original `match` string.
+ */
+function decodeEntity(match: string, entity: string): string {
+  if (NAMED_ENTITIES[entity] !== undefined) {
+    return NAMED_ENTITIES[entity] || match;
+  }
+  if (/^#x[a-f0-9]+$/i.test(entity)) {
+    return String.fromCodePoint(parseInt(entity.slice(2), 16));
+  }
+  if (/^#\d+$/.test(entity)) {
+    return String.fromCodePoint(parseInt(entity.slice(1), 10));
+  }
+  return match;
+}


@@ -675,6 +675,32 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn, humanizeNodes}
       expect(node.endSourceSpan!.end.offset).toEqual(12);
     });
+
+    // This checks backward compatibility with a previous version of the lexer, which would
+    // treat interpolation expressions as regular HTML escapable text.
+    it('should decode HTML entities in interpolations', () => {
+      expect(humanizeDomSourceSpans(parser.parse(
+                 '{{&amp;}}' +
+                     '{{&#x25BE;}}' +
+                     '{{&#9662;}}' +
+                     '{{&amp (no semi-colon)}}' +
+                     '{{&#25BE; (invalid decimal)}}',
+                 'TestComp')))
+          .toEqual([[
+            html.Text,
+            '{{&}}' +
+                '{{\u25BE}}' +
+                '{{\u25BE}}' +
+                '{{&amp (no semi-colon)}}' +
+                '{{&#25BE; (invalid decimal)}}',
+            0,
+            '{{&amp;}}' +
+                '{{&#x25BE;}}' +
+                '{{&#9662;}}' +
+                '{{&amp (no semi-colon)}}' +
+                '{{&#25BE; (invalid decimal)}}',
+          ]]);
+    });
+
     it('should not set the end source span for void elements', () => {
       expect(humanizeDomSourceSpans(parser.parse('<div><br></div>', 'TestComp'))).toEqual([
         [html.Element, 'div', 0, '<div><br></div>', '<div>', '</div>'],


@@ -549,25 +549,66 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     });
     it('should parse interpolation', () => {
-      expect(tokenizeAndHumanizeParts('{{ a }}b{{ c // comment }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a }}b{{ c // comment }}'],
-        [lex.TokenType.EOF],
-      ]);
+      expect(tokenizeAndHumanizeParts('{{ a }}b{{ c // comment }}d{{ e "}}" f }}g{{ h // " i }}'))
+          .toEqual([
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.INTERPOLATION, '{{', ' a ', '}}'],
+            [lex.TokenType.TEXT, 'b'],
+            [lex.TokenType.INTERPOLATION, '{{', ' c // comment ', '}}'],
+            [lex.TokenType.TEXT, 'd'],
+            [lex.TokenType.INTERPOLATION, '{{', ' e "}}" f ', '}}'],
+            [lex.TokenType.TEXT, 'g'],
+            [lex.TokenType.INTERPOLATION, '{{', ' h // " i ', '}}'],
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.EOF],
+          ]);
+      expect(tokenizeAndHumanizeSourceSpans('{{ a }}b{{ c // comment }}')).toEqual([
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{ a }}'],
+        [lex.TokenType.TEXT, 'b'],
+        [lex.TokenType.INTERPOLATION, '{{ c // comment }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.EOF, ''],
+      ]);
     });
     it('should parse interpolation with custom markers', () => {
       expect(tokenizeAndHumanizeParts('{% a %}', {interpolationConfig: {start: '{%', end: '%}'}}))
           .toEqual([
-            [lex.TokenType.TEXT, '{% a %}'],
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.INTERPOLATION, '{%', ' a ', '%}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.EOF],
           ]);
     });
-    it('should handle CR & LF', () => {
+    it('should handle CR & LF in text', () => {
       expect(tokenizeAndHumanizeParts('t\ne\rs\r\nt')).toEqual([
         [lex.TokenType.TEXT, 't\ne\ns\nt'],
         [lex.TokenType.EOF],
       ]);
+      expect(tokenizeAndHumanizeSourceSpans('t\ne\rs\r\nt')).toEqual([
+        [lex.TokenType.TEXT, 't\ne\rs\r\nt'],
+        [lex.TokenType.EOF, ''],
+      ]);
     });
+    it('should handle CR & LF in interpolation', () => {
+      expect(tokenizeAndHumanizeParts('{{t\ne\rs\r\nt}}')).toEqual([
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', 't\ne\ns\nt', '}}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.EOF],
+      ]);
+      expect(tokenizeAndHumanizeSourceSpans('{{t\ne\rs\r\nt}}')).toEqual([
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{t\ne\rs\r\nt}}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.EOF, ''],
+      ]);
+    });
     it('should parse entities', () => {
@@ -575,6 +616,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TEXT, 'a&b'],
         [lex.TokenType.EOF],
       ]);
+      expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
+        [lex.TokenType.TEXT, 'a&amp;b'],
+        [lex.TokenType.EOF, ''],
+      ]);
     });
     it('should parse text starting with "&"', () => {
@@ -593,7 +639,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     it('should allow "<" in text nodes', () => {
       expect(tokenizeAndHumanizeParts('{{ a < b ? c : d }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a < b ? c : d }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a < b ? c : d ', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
@@ -614,7 +662,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     it('should break out of interpolation in text token on valid start tag', () => {
       expect(tokenizeAndHumanizeParts('{{ a <b && c > d }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a '],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a '],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.TAG_OPEN_START, '', 'b'],
         [lex.TokenType.ATTR_NAME, '', '&&'],
         [lex.TokenType.ATTR_NAME, '', 'c'],
@@ -626,7 +676,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     it('should break out of interpolation in text token on valid comment', () => {
       expect(tokenizeAndHumanizeParts('{{ a }<!---->}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a }'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a }'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.COMMENT_START],
         [lex.TokenType.RAW_TEXT, ''],
         [lex.TokenType.COMMENT_END],
@@ -637,7 +689,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     it('should break out of interpolation in text token on valid CDATA', () => {
       expect(tokenizeAndHumanizeParts('{{ a }<![CDATA[]]>}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a }'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a }'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.CDATA_START],
         [lex.TokenType.RAW_TEXT, ''],
         [lex.TokenType.CDATA_END],
@@ -653,13 +707,14 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
           .toEqual([
             [lex.TokenType.TAG_OPEN_START, '', 'code'],
             [lex.TokenType.TAG_OPEN_END],
-            [lex.TokenType.TEXT, '{{\'<={\'}}'],
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.INTERPOLATION, '{{', '\'<={\'', '}}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.TAG_CLOSE, '', 'code'],
             [lex.TokenType.EOF],
           ]);
     });
     it('should parse start tags quotes in place of an attribute name as text', () => {
       expect(tokenizeAndHumanizeParts('<t ">')).toEqual([
         [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
@@ -703,18 +758,32 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     it('should be able to escape {', () => {
       expect(tokenizeAndHumanizeParts('{{ "{" }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ "{" }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' "{" ', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
     });
     it('should be able to escape {{', () => {
       expect(tokenizeAndHumanizeParts('{{ "{{" }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ "{{" }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' "{{" ', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
     });
+    it('should capture everything up to the end of file in the interpolation expression part if there are mismatched quotes',
+       () => {
+         expect(tokenizeAndHumanizeParts('{{ "{{a}}\' }}')).toEqual([
+           [lex.TokenType.TEXT, ''],
+           [lex.TokenType.INTERPOLATION, '{{', ' "{{a}}\' }}'],
+           [lex.TokenType.TEXT, ''],
+           [lex.TokenType.EOF],
+         ]);
+       });
     it('should treat expansion form as text when they are not parsed', () => {
       expect(tokenizeAndHumanizeParts(
                  '<span>{a, b, =4 {c}}</span>', {tokenizeExpansionForms: false}))
@@ -976,7 +1045,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.RAW_TEXT, 'three'],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=4'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'four {{a}}'],
+        [lex.TokenType.TEXT, 'four '],
+        [lex.TokenType.INTERPOLATION, '{{', 'a', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.EOF],
@@ -1033,7 +1104,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'One {{message}}'],
+        [lex.TokenType.TEXT, 'One '],
+        [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.TEXT, '\n'],
@@ -1063,7 +1136,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'One {{message}}'],
+        [lex.TokenType.TEXT, 'One '],
+        [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.TEXT, '\n'],
@@ -1144,7 +1219,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'One {{message}}'],
+        [lex.TokenType.TEXT, 'One '],
+        [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.TEXT, '\n'],
@@ -1174,7 +1251,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'One {{message}}'],
+        [lex.TokenType.TEXT, 'One '],
+        [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.TEXT, '\n'],
@@ -1301,8 +1380,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TEXT, '\n \n \n'],
         [lex.TokenType.EOF],
       ]);
-      expect(tokenizeAndHumanizeParts('\\r \\r \\r', {escapedString: true})).toEqual([
-        [lex.TokenType.TEXT, '\n \n \n'],  // post processing converts `\r` to `\n`
+      expect(tokenizeAndHumanizeParts('\\r{{\\r}}\\r', {escapedString: true})).toEqual([
+        // post processing converts `\r` to `\n`
+        [lex.TokenType.TEXT, '\n'],
+        [lex.TokenType.INTERPOLATION, '{{', '\n', '}}'],
+        [lex.TokenType.TEXT, '\n'],
         [lex.TokenType.EOF],
       ]);
       expect(tokenizeAndHumanizeParts('\\v \\v \\v', {escapedString: true})).toEqual([