refactor(compiler): support interpolation tokens when lexing attribute values (#42062)

The lexer now splits interpolation tokens out of attribute value tokens.
Previously the attribute value of `<div attr="Hello, {{ name}}">` was lexed as a
single token. Now it is lexed as three tokens:

```
ATTR_VALUE_TEXT: "Hello, "
ATTR_VALUE_INTERPOLATION: "{{", " name", "}}"
ATTR_VALUE_TEXT: ""
```

- ATTR_VALUE_INTERPOLATION tokens have three parts: "start marker",
  "expression" and "end marker".
- ATTR_VALUE_INTERPOLATION tokens are always preceded and followed
  by ATTR_VALUE_TEXT tokens, even if those tokens represent an empty
  string (see the sketch below).
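
For example, with the new token types the lexer spec's `tokenizeAndHumanizeParts`
helper (exercised in the test diff below) should report the attribute above roughly
as follows — a sketch, assuming the `[TokenType, ...parts]` shape the existing
specs use:

```
expect(tokenizeAndHumanizeParts('<div attr="Hello, {{ name}}">')).toEqual([
  [lex.TokenType.TAG_OPEN_START, '', 'div'],
  [lex.TokenType.ATTR_NAME, '', 'attr'],
  [lex.TokenType.ATTR_QUOTE, '"'],
  [lex.TokenType.ATTR_VALUE_TEXT, 'Hello, '],
  [lex.TokenType.ATTR_VALUE_INTERPOLATION, '{{', ' name', '}}'],
  [lex.TokenType.ATTR_VALUE_TEXT, ''],
  [lex.TokenType.ATTR_QUOTE, '"'],
  [lex.TokenType.TAG_OPEN_END],
  [lex.TokenType.EOF],
]);
```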

The HTML parser has been modified to recombine these tokens, so that this
refactoring has only a limited effect in this commit. Further refactorings
that use these new tokens directly will follow in subsequent commits.
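
A minimal sketch of that recombination, condensed from the `_consumeAttr` change
in the parser diff below (`decodeEntity` is the parser's existing entity decoder,
shown here only at its call site):

```
// Recombine alternating ATTR_VALUE_TEXT / ATTR_VALUE_INTERPOLATION tokens
// back into the single attribute value string the rest of the parser expects.
let value = '';
while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
       this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
  const valueToken = this._advance();
  if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
    // Backward compatibility: entities inside interpolations are decoded.
    value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
  } else {
    value += valueToken.parts.join('');
  }
}
```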

PR Close #42062
Pete Bacon Darwin, 2021-05-13 17:00:56 +01:00 (committed by atscott)
parent 3d3b69ff81
commit c516e252fc
4 changed files with 143 additions and 68 deletions


@@ -29,7 +29,8 @@ export enum TokenType {
CDATA_END,
ATTR_NAME,
ATTR_QUOTE,
ATTR_VALUE,
ATTR_VALUE_TEXT,
ATTR_VALUE_INTERPOLATION,
DOC_TYPE,
EXPANSION_FORM_START,
EXPANSION_CASE_VALUE,
@@ -228,7 +229,8 @@ class _Tokenizer {
this._consumeTagOpen(start);
}
} else if (!(this._tokenizeIcu && this._tokenizeExpansionForm())) {
this._consumeText();
this._consumeWithInterpolation(
TokenType.TEXT, TokenType.INTERPOLATION, () => this._isTextEnd());
}
} catch (e) {
this.handleError(e);
@@ -595,29 +597,25 @@ class _Tokenizer {
private _consumeAttributeValue() {
let value: string;
if (this._cursor.peek() === chars.$SQ || this._cursor.peek() === chars.$DQ) {
this._beginToken(TokenType.ATTR_QUOTE);
const quoteChar = this._cursor.peek();
this._cursor.advance();
this._endToken([String.fromCodePoint(quoteChar)]);
this._beginToken(TokenType.ATTR_VALUE);
const parts: string[] = [];
while (this._cursor.peek() !== quoteChar) {
parts.push(this._readChar(true));
}
value = parts.join('');
this._endToken([this._processCarriageReturns(value)]);
this._beginToken(TokenType.ATTR_QUOTE);
this._cursor.advance();
this._endToken([String.fromCodePoint(quoteChar)]);
this._consumeQuote(quoteChar);
this._consumeWithInterpolation(
TokenType.ATTR_VALUE_TEXT, TokenType.ATTR_VALUE_INTERPOLATION,
() => this._cursor.peek() === quoteChar);
this._consumeQuote(quoteChar);
} else {
this._beginToken(TokenType.ATTR_VALUE);
const valueStart = this._cursor.clone();
this._requireCharCodeUntilFn(isNameEnd, 1);
value = this._cursor.getChars(valueStart);
this._endToken([this._processCarriageReturns(value)]);
const endPredicate = () => isNameEnd(this._cursor.peek());
this._consumeWithInterpolation(
TokenType.ATTR_VALUE_TEXT, TokenType.ATTR_VALUE_INTERPOLATION, endPredicate);
}
}
private _consumeQuote(quoteChar: number) {
this._beginToken(TokenType.ATTR_QUOTE);
this._requireCharCode(quoteChar);
this._endToken([String.fromCodePoint(quoteChar)]);
}
private _consumeTagOpenEnd() {
const tokenType =
this._attemptCharCode(chars.$SLASH) ? TokenType.TAG_OPEN_END_VOID : TokenType.TAG_OPEN_END;
@@ -696,21 +694,31 @@
this._expansionCaseStack.pop();
}
private _consumeText() {
this._beginToken(TokenType.TEXT);
/**
* Consume a string that may contain interpolation expressions.
* The first token consumed will be of `tokenType` and then there will be alternating
* `interpolationTokenType` and `tokenType` tokens until the `endPredicate()` returns true.
*
* @param textTokenType the kind of tokens to interleave around interpolation tokens.
* @param interpolationTokenType the kind of tokens that contain interpolation.
* @param endPredicate a function that should return true when we should stop consuming.
*/
private _consumeWithInterpolation(
textTokenType: TokenType, interpolationTokenType: TokenType, endPredicate: () => boolean) {
this._beginToken(textTokenType);
const parts: string[] = [];
do {
while (!endPredicate()) {
const current = this._cursor.clone();
if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
this._endToken([this._processCarriageReturns(parts.join(''))], current);
this._consumeInterpolation(current);
this._consumeInterpolation(interpolationTokenType, current);
parts.length = 0;
this._beginToken(TokenType.TEXT);
this._beginToken(textTokenType);
} else {
parts.push(this._readChar(true));
}
} while (!this._isTextEnd());
}
// It is possible that an interpolation was started but not ended inside this text token.
// Make sure that we reset the state of the lexer correctly.
@@ -719,14 +727,15 @@
this._endToken([this._processCarriageReturns(parts.join(''))]);
}
private _consumeInterpolation(interpolationStart: CharacterCursor) {
private _consumeInterpolation(
interpolationTokenType: TokenType, interpolationStart: CharacterCursor) {
const parts: string[] = [];
this._beginToken(TokenType.INTERPOLATION, interpolationStart);
this._beginToken(interpolationTokenType, interpolationStart);
parts.push(this._interpolationConfig.start);
// Find the end of the interpolation, ignoring content inside quotes.
const expressionStart = this._cursor.clone();
let inQuote: string|null = null;
let inQuote: number|null = null;
let inComment = false;
while (this._cursor.peek() !== chars.$EOF) {
const current = this._cursor.clone();
@@ -752,14 +761,15 @@
}
}
const char = this._readChar(true);
if (char === '\\') {
const char = this._cursor.peek();
this._cursor.advance();
if (char === chars.$BACKSLASH) {
// Skip the next character because it was escaped.
this._readChar(true);
this._cursor.advance();
} else if (char === inQuote) {
// Exiting the current quoted string
inQuote = null;
} else if (!inComment && /['"`]/.test(char)) {
} else if (!inComment && chars.isQuote(char)) {
// Entering a new quoted string
inQuote = char;
}

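The `_consumeInterpolation` loop above now tracks quoting with character codes
(`inQuote: number|null`, `chars.isQuote`) instead of string comparisons. A
self-contained sketch of the same scan, with local stand-ins for the compiler's
`chars` helpers and with the comment-skipping branch elided:

```
const $BACKSLASH = 92;
// Stand-in for chars.isQuote: ", ' and ` count as quote characters.
const isQuote = (code: number): boolean =>
    code === 34 /* " */ || code === 39 /* ' */ || code === 96 /* ` */;

// Find the index of the interpolation end marker, ignoring any occurrence
// of it inside a quoted string. Returns -1 for an unterminated interpolation.
function findInterpolationEnd(text: string, start: number, end = '}}'): number {
  let inQuote: number|null = null;
  for (let i = start; i < text.length; i++) {
    const code = text.charCodeAt(i);
    if (code === $BACKSLASH) {
      i++;  // skip the next character because it was escaped
    } else if (code === inQuote) {
      inQuote = null;  // exiting the current quoted string
    } else if (inQuote === null) {
      if (text.startsWith(end, i)) {
        return i;  // end marker found outside any quotes
      }
      if (isQuote(code)) {
        inQuote = code;  // entering a new quoted string
      }
    }
  }
  return -1;
}

// findInterpolationEnd(`{{ '}}' }}`, 2) === 8: the quoted '}}' is skipped.
```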

@@ -6,7 +6,7 @@
* found in the LICENSE file at https://angular.io/license
*/
import {ParseError, ParseSourceSpan} from '../parse_util';
import {ParseError, ParseLocation, ParseSourceSpan} from '../parse_util';
import * as html from './ast';
import {NAMED_ENTITIES} from './entities';
@@ -362,27 +362,49 @@ class _TreeBuilder {
private _consumeAttr(attrName: lex.Token): html.Attribute {
const fullName = mergeNsAndName(attrName.parts[0], attrName.parts[1]);
let end = attrName.sourceSpan.end;
let value = '';
let valueSpan: ParseSourceSpan = undefined!;
let attrEnd = attrName.sourceSpan.end;
// Consume any quote
if (this._peek.type === lex.TokenType.ATTR_QUOTE) {
this._advance();
}
if (this._peek.type === lex.TokenType.ATTR_VALUE) {
const valueToken = this._advance();
value = valueToken.parts[0];
end = valueToken.sourceSpan.end;
valueSpan = valueToken.sourceSpan;
// Consume the value
let value = '';
let valueStartSpan: ParseSourceSpan|undefined = undefined;
let valueEnd: ParseLocation|undefined = undefined;
if (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT) {
valueStartSpan = this._peek.sourceSpan;
valueEnd = this._peek.sourceSpan.end;
// For now we are recombining text and interpolation tokens
while (this._peek.type === lex.TokenType.ATTR_VALUE_TEXT ||
this._peek.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
let valueToken = this._advance();
if (valueToken.type === lex.TokenType.ATTR_VALUE_INTERPOLATION) {
// For backward compatibility we decode HTML entities that appear in interpolation
// expressions. This is arguably a bug, but it could be a considerable breaking change to
// fix it. It should be addressed in a larger project to refactor the entire parser/lexer
// chain after View Engine has been removed.
value += valueToken.parts.join('').replace(/&([^;]+);/g, decodeEntity);
} else {
value += valueToken.parts.join('');
}
valueEnd = attrEnd = valueToken.sourceSpan.end;
}
}
// Consume any quote
if (this._peek.type === lex.TokenType.ATTR_QUOTE) {
const quoteToken = this._advance();
end = quoteToken.sourceSpan.end;
attrEnd = quoteToken.sourceSpan.end;
}
const keySpan = new ParseSourceSpan(attrName.sourceSpan.start, attrName.sourceSpan.end);
const valueSpan = valueStartSpan && valueEnd &&
new ParseSourceSpan(valueStartSpan.start, valueEnd, valueStartSpan.fullStart);
return new html.Attribute(
fullName, value,
new ParseSourceSpan(attrName.sourceSpan.start, end, attrName.sourceSpan.fullStart), keySpan,
valueSpan);
new ParseSourceSpan(attrName.sourceSpan.start, attrEnd, attrName.sourceSpan.fullStart),
attrName.sourceSpan, valueSpan);
}
private _getParentElement(): html.Element|null {

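The backward-compatibility decoding noted above is exactly what the new spec
below exercises: `foo="{{&amp;}}"` produces the attribute value `{{&}}`. A
hedged illustration of that substitution, with a minimal stand-in for the
parser's `decodeEntity`/`NAMED_ENTITIES` machinery:

```
// Sketch only: NAMED_ENTITIES in the real parser covers the full set of named
// HTML entities; this stand-in decodes just enough to show the regex at work.
const NAMED_ENTITIES_SKETCH: {[name: string]: string} = {amp: '&', lt: '<', gt: '>'};
const decodeEntitySketch = (match: string, entity: string): string =>
    NAMED_ENTITIES_SKETCH[entity] ?? match;

const parts = ['{{', '&amp;', '}}'];  // an ATTR_VALUE_INTERPOLATION token's parts
const value = parts.join('').replace(/&([^;]+);/g, decodeEntitySketch);
// value === '{{&}}', matching the expectation in the spec below
```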

@@ -250,6 +250,19 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn, humanizeNodes}
]);
});
it('should decode HTML entities in interpolated attributes', () => {
// Note that the detail of decoding corner-cases is tested in the
// "should decode HTML entities in interpolations" spec.
expect(humanizeDomSourceSpans(parser.parse('<div foo="{{&amp;}}"></div>', 'TestComp')))
.toEqual([
[
html.Element, 'div', 0, '<div foo="{{&amp;}}"></div>', '<div foo="{{&amp;}}">',
'</div>'
],
[html.Attribute, 'foo', '{{&}}', 'foo="{{&amp;}}"']
]);
});
it('should normalize line endings within attribute values', () => {
const result =
parser.parse('<div key=" \r\n line 1 \r\n line 2 "></div>', 'TestComp');


@@ -257,7 +257,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
[lex.TokenType.ATTR_NAME, 'class'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'hi'],
[lex.TokenType.ATTR_VALUE_TEXT, 'hi'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_NAME, 'sty'],
[lex.TokenType.TAG_OPEN_START, '<span'],
@@ -295,15 +295,21 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, '{{v}}'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ATTR_VALUE_INTERPOLATION, '{{', 'v', '}}'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_NAME, '', 'b'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 's{{m}}e'],
[lex.TokenType.ATTR_VALUE_TEXT, 's'],
[lex.TokenType.ATTR_VALUE_INTERPOLATION, '{{', 'm', '}}'],
[lex.TokenType.ATTR_VALUE_TEXT, 'e'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_NAME, '', 'c'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 's{{m//c}}e'],
[lex.TokenType.ATTR_VALUE_TEXT, 's'],
[lex.TokenType.ATTR_VALUE_INTERPOLATION, '{{', 'm//c', '}}'],
[lex.TokenType.ATTR_VALUE_TEXT, 'e'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -333,7 +339,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '\''],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.ATTR_QUOTE, '\''],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -345,7 +351,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -356,7 +362,31 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
expect(tokenizeAndHumanizeParts('<t a=b>')).toEqual([
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
]);
});
it('should parse attributes with unquoted interpolation value', () => {
expect(tokenizeAndHumanizeParts('<a a={{link.text}}>')).toEqual([
[lex.TokenType.TAG_OPEN_START, '', 'a'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ATTR_VALUE_INTERPOLATION, '{{', 'link.text', '}}'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
]);
});
it('should parse attributes with empty quoted value', () => {
expect(tokenizeAndHumanizeParts('<t a="">')).toEqual([
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE_TEXT, ''],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
]);
@@ -366,7 +396,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
expect(tokenizeAndHumanizeParts('<t a = b >')).toEqual([
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
]);
@@ -377,7 +407,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'AA'],
[lex.TokenType.ATTR_VALUE_TEXT, 'AA'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -389,11 +419,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, '&amp'],
[lex.TokenType.ATTR_VALUE_TEXT, '&amp'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_NAME, '', 'b'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'c&&d'],
[lex.TokenType.ATTR_VALUE_TEXT, 'c&&d'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -405,7 +435,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'b && c &'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b && c &'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -417,7 +447,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '\''],
[lex.TokenType.ATTR_VALUE, 't\ne\ns\nt'],
[lex.TokenType.ATTR_VALUE_TEXT, 't\ne\ns\nt'],
[lex.TokenType.ATTR_QUOTE, '\''],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -428,7 +458,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
expect(tokenizeAndHumanizeSourceSpans('<t a=b>')).toEqual([
[lex.TokenType.TAG_OPEN_START, '<t'],
[lex.TokenType.ATTR_NAME, 'a'],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.TAG_OPEN_END, '>'],
[lex.TokenType.EOF, ''],
]);
@@ -436,13 +466,13 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
it('should report missing closing single quote', () => {
expect(tokenizeAndHumanizeErrors('<t a=\'b>')).toEqual([
[lex.TokenType.ATTR_VALUE, 'Unexpected character "EOF"', '0:8'],
[lex.TokenType.ATTR_VALUE_TEXT, 'Unexpected character "EOF"', '0:8'],
]);
});
it('should report missing closing double quote', () => {
expect(tokenizeAndHumanizeErrors('<t a="b>')).toEqual([
[lex.TokenType.ATTR_VALUE, 'Unexpected character "EOF"', '0:8'],
[lex.TokenType.ATTR_VALUE_TEXT, 'Unexpected character "EOF"', '0:8'],
]);
});
});
@@ -735,7 +765,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.ATTR_QUOTE, '"'],
// TODO(ayazhafiz): the " symbol should be a synthetic attribute,
// allowing us to complete the opening tag correctly.
@@ -747,7 +777,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '\''],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.ATTR_QUOTE, '\''],
// TODO(ayazhafiz): the ' symbol should be a synthetic attribute,
// allowing us to complete the opening tag correctly.
@@ -1538,11 +1568,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_NAME, '', 'c'],
[lex.TokenType.ATTR_QUOTE, '\''],
[lex.TokenType.ATTR_VALUE, 'd'],
[lex.TokenType.ATTR_VALUE_TEXT, 'd'],
[lex.TokenType.ATTR_QUOTE, '\''],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.EOF],
@@ -1591,7 +1621,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'd'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.ATTR_VALUE, 'e'],
[lex.TokenType.ATTR_VALUE_TEXT, 'e'],
[lex.TokenType.ATTR_QUOTE, '"'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.TAG_CLOSE, '', 't'],
@@ -1604,7 +1634,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([
[lex.TokenType.TAG_OPEN_START, '', 't'],
[lex.TokenType.ATTR_NAME, '', 'a'],
[lex.TokenType.ATTR_VALUE, 'b'],
[lex.TokenType.ATTR_VALUE_TEXT, 'b'],
[lex.TokenType.TAG_OPEN_END],
[lex.TokenType.TAG_CLOSE, '', 't'],
[lex.TokenType.EOF],