From c8a46bfdcd5dac0044c4322a5b3967163056b339 Mon Sep 17 00:00:00 2001
From: Pete Bacon Darwin
Date: Tue, 11 May 2021 17:03:38 +0100
Subject: [PATCH] refactor(compiler): support interpolation tokens when
 lexing markup (#42062)

The lexer now splits interpolation tokens out from text tokens.

Previously the contents of `<div>Hello, {{ name}}</div>` would be a
single text token. Now they become three tokens:

```
TEXT: "Hello, "
INTERPOLATION: "{{", " name", "}}"
TEXT: ""
```

- INTERPOLATION tokens have three parts: "start marker", "expression"
  and "end marker".
- INTERPOLATION tokens are always preceded and followed by TEXT tokens,
  even if those TEXT tokens represent an empty string.

The HTML parser has been modified to recombine these tokens, so this
refactoring has only a limited effect in this commit. Further
refactorings to use these new tokens will follow in subsequent commits.

PR Close #42062
---
 packages/compiler/src/ml_parser/lexer.ts      |  73 +++++++++--
 packages/compiler/src/ml_parser/parser.ts     |  43 +++++-
 .../test/ml_parser/html_parser_spec.ts        |  26 ++++
 .../compiler/test/ml_parser/lexer_spec.ts     | 122 +++++++++++++++---
 4 files changed, 233 insertions(+), 31 deletions(-)

diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts
index f0fb361232..d62a54f576 100644
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -22,6 +22,7 @@ export enum TokenType {
   TEXT,
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
+  INTERPOLATION,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -285,7 +286,7 @@ class _Tokenizer {
     }
     const token = new Token(
         this._currentTokenType, parts,
-        this._cursor.getSpan(this._currentTokenStart, this._leadingTriviaCodePoints));
+        (end ?? this._cursor).getSpan(this._currentTokenStart, this._leadingTriviaCodePoints));
     this.tokens.push(token);
     this._currentTokenStart = null;
     this._currentTokenType = null;
@@ -696,19 +697,16 @@ class _Tokenizer {
   }
 
   private _consumeText() {
-    const start = this._cursor.clone();
-    this._beginToken(TokenType.TEXT, start);
+    this._beginToken(TokenType.TEXT);
     const parts: string[] = [];
 
     do {
+      const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
-        parts.push(this._interpolationConfig.start);
-        this._inInterpolation = true;
-      } else if (
-          this._interpolationConfig && this._inInterpolation &&
-          this._attemptStr(this._interpolationConfig.end)) {
-        parts.push(this._interpolationConfig.end);
-        this._inInterpolation = false;
+        this._endToken([this._processCarriageReturns(parts.join(''))], current);
+        this._consumeInterpolation(current);
+        parts.length = 0;
+        this._beginToken(TokenType.TEXT);
       } else {
         parts.push(this._readChar(true));
       }
@@ -721,6 +719,61 @@ class _Tokenizer {
     this._endToken([this._processCarriageReturns(parts.join(''))]);
   }
 
+  private _consumeInterpolation(interpolationStart: CharacterCursor) {
+    const parts: string[] = [];
+    this._beginToken(TokenType.INTERPOLATION, interpolationStart);
+    parts.push(this._interpolationConfig.start);
+
+    // Find the end of the interpolation, ignoring content inside quotes.
+    const expressionStart = this._cursor.clone();
+    let inQuote: string|null = null;
+    let inComment = false;
+    while (this._cursor.peek() !== chars.$EOF) {
+      const current = this._cursor.clone();
+
+      if (this._isTagStart()) {
+        // We are starting what looks like an HTML element in the middle of this interpolation.
+        // Reset the cursor to before the `<` character and end the interpolation token.
+        // (This is actually wrong but here for backward compatibility).
+        this._cursor = current;
+        parts.push(this._getProcessedChars(expressionStart, current));
+        return this._endToken(parts);
+      }
+
+      if (inQuote === null) {
+        if (this._attemptStr(this._interpolationConfig.end)) {
+          // We are not in a string, and we hit the end interpolation marker
+          parts.push(this._getProcessedChars(expressionStart, current));
+          parts.push(this._interpolationConfig.end);
+          return this._endToken(parts);
+        } else if (this._attemptStr('//')) {
+          // Once we are in a comment we ignore any quotes
+          inComment = true;
+        }
+      }
+
+      const char = this._readChar(true);
+      if (char === '\\') {
+        // Skip the next character because it was escaped.
+        this._readChar(true);
+      } else if (char === inQuote) {
+        // Exiting the current quoted string
+        inQuote = null;
+      } else if (!inComment && /['"`]/.test(char)) {
+        // Entering a new quoted string
+        inQuote = char;
+      }
+    }
+
+    // We hit EOF without finding a closing interpolation marker
+    parts.push(this._getProcessedChars(expressionStart, this._cursor));
+    return this._endToken(parts);
+  }
+
+  private _getProcessedChars(start: CharacterCursor, end: CharacterCursor): string {
+    return this._processCarriageReturns(end.getChars(start))
+  }
+
   private _isTextEnd(): boolean {
     if (this._isTagStart() || this._cursor.peek() === chars.$EOF) {
       return true;
diff --git a/packages/compiler/src/ml_parser/parser.ts b/packages/compiler/src/ml_parser/parser.ts
index 24465f8e97..fd01357d43 100644
--- a/packages/compiler/src/ml_parser/parser.ts
+++ b/packages/compiler/src/ml_parser/parser.ts
@@ -9,6 +9,7 @@
 import {ParseError, ParseSourceSpan} from '../parse_util';
 
 import * as html from './ast';
+import {NAMED_ENTITIES} from './entities';
 import * as lex from './lexer';
 import {getNsPrefix, mergeNsAndName, splitNsName, TagDefinition} from './tags';
 
@@ -215,6 +216,7 @@ class _TreeBuilder {
   }
 
   private _consumeText(token: lex.Token) {
+    const startSpan = token.sourceSpan;
     let text = token.parts[0];
     if (text.length > 0 && text[0] == '\n') {
       const parent = this._getParentElement();
@@ -224,8 +226,29 @@ class _TreeBuilder {
       }
     }
 
+    // For now recombine text and interpolation tokens
+    if (this._peek.type === lex.TokenType.INTERPOLATION) {
+      while (this._peek.type === lex.TokenType.INTERPOLATION ||
+             this._peek.type === lex.TokenType.TEXT) {
+        token = this._advance();
+        if (token.type === lex.TokenType.INTERPOLATION) {
+          // For backward compatibility we decode HTML entities that appear in interpolation
+          // expressions. This is arguably a bug, but it could be a considerable breaking change to
+          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+          // chain after View Engine has been removed.
+          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else {
+          text += token.parts.join('');
+        }
+      }
+    }
+
     if (text.length > 0) {
-      this._addToParent(new html.Text(text, token.sourceSpan));
+      const endSpan = token.sourceSpan;
+      this._addToParent(new html.Text(
+          text,
+          new ParseSourceSpan(
+              startSpan.start, endSpan.end, startSpan.fullStart, startSpan.details)));
     }
   }
 
@@ -395,3 +418,21 @@ class _TreeBuilder {
 function lastOnStack(stack: any[], element: any): boolean {
   return stack.length > 0 && stack[stack.length - 1] === element;
 }
+
+/**
+ * Decode the `entity` string, which we believe is the contents of an HTML entity.
+ *
+ * If the string is not actually a valid/known entity then just return the original `match` string.
+ */ +function decodeEntity(match: string, entity: string): string { + if (NAMED_ENTITIES[entity] !== undefined) { + return NAMED_ENTITIES[entity] || match; + } + if (/^#x[a-f0-9]+$/i.test(entity)) { + return String.fromCodePoint(parseInt(entity.slice(2), 16)); + } + if (/^#\d+$/.test(entity)) { + return String.fromCodePoint(parseInt(entity.slice(1), 10)); + } + return match; +} diff --git a/packages/compiler/test/ml_parser/html_parser_spec.ts b/packages/compiler/test/ml_parser/html_parser_spec.ts index b971d9187a..279bca60d3 100644 --- a/packages/compiler/test/ml_parser/html_parser_spec.ts +++ b/packages/compiler/test/ml_parser/html_parser_spec.ts @@ -675,6 +675,32 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn, humanizeNodes} expect(node.endSourceSpan!.end.offset).toEqual(12); }); + // This checks backward compatibility with a previous version of the lexer, which would + // treat interpolation expressions as regular HTML escapable text. + it('should decode HTML entities in interpolations', () => { + expect(humanizeDomSourceSpans(parser.parse( + '{{&}}' + + '{{▾}}' + + '{{▾}}' + + '{{& (no semi-colon)}}' + + '{{BE; (invalid decimal)}}', + 'TestComp'))) + .toEqual([[ + html.Text, + '{{&}}' + + '{{\u25BE}}' + + '{{\u25BE}}' + + '{{& (no semi-colon)}}' + + '{{BE; (invalid decimal)}}', + 0, + '{{&}}' + + '{{▾}}' + + '{{▾}}' + + '{{& (no semi-colon)}}' + + '{{BE; (invalid decimal)}}', + ]]); + }); + it('should not set the end source span for void elements', () => { expect(humanizeDomSourceSpans(parser.parse('

', 'TestComp'))).toEqual([ [html.Element, 'div', 0, '

', '
', '
'], diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts index 5c795ed959..54005b28ba 100644 --- a/packages/compiler/test/ml_parser/lexer_spec.ts +++ b/packages/compiler/test/ml_parser/lexer_spec.ts @@ -549,25 +549,66 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u }); it('should parse interpolation', () => { - expect(tokenizeAndHumanizeParts('{{ a }}b{{ c // comment }}')).toEqual([ - [lex.TokenType.TEXT, '{{ a }}b{{ c // comment }}'], - [lex.TokenType.EOF], + expect(tokenizeAndHumanizeParts('{{ a }}b{{ c // comment }}d{{ e "}}" f }}g{{ h // " i }}')) + .toEqual([ + [lex.TokenType.TEXT, ''], + [lex.TokenType.INTERPOLATION, '{{', ' a ', '}}'], + [lex.TokenType.TEXT, 'b'], + [lex.TokenType.INTERPOLATION, '{{', ' c // comment ', '}}'], + [lex.TokenType.TEXT, 'd'], + [lex.TokenType.INTERPOLATION, '{{', ' e "}}" f ', '}}'], + [lex.TokenType.TEXT, 'g'], + [lex.TokenType.INTERPOLATION, '{{', ' h // " i ', '}}'], + [lex.TokenType.TEXT, ''], + [lex.TokenType.EOF], + ]); + + expect(tokenizeAndHumanizeSourceSpans('{{ a }}b{{ c // comment }}')).toEqual([ + [lex.TokenType.TEXT, ''], + [lex.TokenType.INTERPOLATION, '{{ a }}'], + [lex.TokenType.TEXT, 'b'], + [lex.TokenType.INTERPOLATION, '{{ c // comment }}'], + [lex.TokenType.TEXT, ''], + [lex.TokenType.EOF, ''], ]); }); it('should parse interpolation with custom markers', () => { expect(tokenizeAndHumanizeParts('{% a %}', {interpolationConfig: {start: '{%', end: '%}'}})) .toEqual([ - [lex.TokenType.TEXT, '{% a %}'], + [lex.TokenType.TEXT, ''], + [lex.TokenType.INTERPOLATION, '{%', ' a ', '%}'], + [lex.TokenType.TEXT, ''], [lex.TokenType.EOF], ]); }); - it('should handle CR & LF', () => { + it('should handle CR & LF in text', () => { expect(tokenizeAndHumanizeParts('t\ne\rs\r\nt')).toEqual([ [lex.TokenType.TEXT, 't\ne\ns\nt'], [lex.TokenType.EOF], ]); + + expect(tokenizeAndHumanizeSourceSpans('t\ne\rs\r\nt')).toEqual([ + [lex.TokenType.TEXT, 't\ne\rs\r\nt'], + [lex.TokenType.EOF, ''], + ]); + }); + + it('should handle CR & LF in interpolation', () => { + expect(tokenizeAndHumanizeParts('{{t\ne\rs\r\nt}}')).toEqual([ + [lex.TokenType.TEXT, ''], + [lex.TokenType.INTERPOLATION, '{{', 't\ne\ns\nt', '}}'], + [lex.TokenType.TEXT, ''], + [lex.TokenType.EOF], + ]); + + expect(tokenizeAndHumanizeSourceSpans('{{t\ne\rs\r\nt}}')).toEqual([ + [lex.TokenType.TEXT, ''], + [lex.TokenType.INTERPOLATION, '{{t\ne\rs\r\nt}}'], + [lex.TokenType.TEXT, ''], + [lex.TokenType.EOF, ''], + ]); }); it('should parse entities', () => { @@ -575,6 +616,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TEXT, 'a&b'], [lex.TokenType.EOF], ]); + + expect(tokenizeAndHumanizeSourceSpans('a&b')).toEqual([ + [lex.TokenType.TEXT, 'a&b'], + [lex.TokenType.EOF, ''], + ]); }); it('should parse text starting with "&"', () => { @@ -593,7 +639,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u it('should allow "<" in text nodes', () => { expect(tokenizeAndHumanizeParts('{{ a < b ? c : d }}')).toEqual([ - [lex.TokenType.TEXT, '{{ a < b ? c : d }}'], + [lex.TokenType.TEXT, ''], + [lex.TokenType.INTERPOLATION, '{{', ' a < b ? 
+        [lex.TokenType.INTERPOLATION, '{{', ' a < b ? c : d ', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
 
@@ -614,7 +662,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
     it('should break out of interpolation in text token on valid start tag', () => {
       expect(tokenizeAndHumanizeParts('{{ a <b && c > d }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a '],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a '],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.TAG_OPEN_START, '', 'b'],
         [lex.TokenType.ATTR_NAME, '', '&&'],
         [lex.TokenType.ATTR_NAME, '', 'c'],
         [lex.TokenType.TAG_OPEN_END],
         [lex.TokenType.TEXT, ' d '],
         [lex.TokenType.EOF],
       ]);
     });
@@ -626,7 +676,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
     it('should break out of interpolation in text token on valid comment', () => {
       expect(tokenizeAndHumanizeParts('{{ a }<!---->}}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a }'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a }'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.COMMENT_START],
         [lex.TokenType.RAW_TEXT, ''],
         [lex.TokenType.COMMENT_END],
         [lex.TokenType.TEXT, '}}'],
         [lex.TokenType.EOF],
       ]);
     });
@@ -637,7 +689,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
     it('should break out of interpolation in text token on valid CDATA', () => {
       expect(tokenizeAndHumanizeParts('{{ a }<![CDATA[]]>}}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a }'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a }'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.CDATA_START],
         [lex.TokenType.RAW_TEXT, ''],
         [lex.TokenType.CDATA_END],
         [lex.TokenType.TEXT, '}}'],
         [lex.TokenType.EOF],
       ]);
     });
@@ -653,13 +707,14 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
           .toEqual([
             [lex.TokenType.TAG_OPEN_START, '', 'code'],
             [lex.TokenType.TAG_OPEN_END],
-            [lex.TokenType.TEXT, '{{\'<={\'}}'],
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.INTERPOLATION, '{{', '\'<={\'', '}}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.TAG_CLOSE, '', 'code'],
             [lex.TokenType.EOF],
           ]);
     });
 
-
     it('should parse start tags quotes in place of an attribute name as text', () => {
       expect(tokenizeAndHumanizeParts('<t ">')).toEqual([
         [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
         [lex.TokenType.ATTR_NAME, '', '"'],
@@ -703,18 +758,32 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     it('should be able to escape {', () => {
       expect(tokenizeAndHumanizeParts('{{ "{" }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ "{" }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' "{" ', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
     });
 
     it('should be able to escape {{', () => {
       expect(tokenizeAndHumanizeParts('{{ "{{" }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ "{{" }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' "{{" ', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
     });
 
+    it('should capture everything up to the end of file in the interpolation expression part if there are mismatched quotes',
+       () => {
+         expect(tokenizeAndHumanizeParts('{{ "{{a}}\' }}')).toEqual([
+           [lex.TokenType.TEXT, ''],
+           [lex.TokenType.INTERPOLATION, '{{', ' "{{a}}\' }}'],
+           [lex.TokenType.TEXT, ''],
+           [lex.TokenType.EOF],
+         ]);
+       });
+
     it('should treat expansion form as text when they are not parsed', () => {
       expect(tokenizeAndHumanizeParts(
                  '<span>{a, b, =4 {c}}</span>', {tokenizeExpansionForms: false}))
          .toEqual([
@@ -976,7 +1045,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
             [lex.TokenType.RAW_TEXT, 'three'],
             [lex.TokenType.EXPANSION_CASE_VALUE, '=4'],
             [lex.TokenType.EXPANSION_CASE_EXP_START],
-            [lex.TokenType.TEXT, 'four {{a}}'],
+            [lex.TokenType.TEXT, 'four '],
+            [lex.TokenType.INTERPOLATION, '{{', 'a', '}}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_FORM_END],
             [lex.TokenType.EOF],
@@ -1033,7 +1104,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
             [lex.TokenType.EXPANSION_CASE_EXP_START],
-            [lex.TokenType.TEXT, 'One {{message}}'],
+            [lex.TokenType.TEXT, 'One '],
+            [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_FORM_END],
             [lex.TokenType.TEXT, '\n'],
@@ -1063,7 +1136,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
             [lex.TokenType.EXPANSION_CASE_EXP_START],
-            [lex.TokenType.TEXT, 'One {{message}}'],
+            [lex.TokenType.TEXT, 'One '],
+            [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_FORM_END],
             [lex.TokenType.TEXT, '\n'],
@@ -1144,7 +1219,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
             [lex.TokenType.EXPANSION_CASE_EXP_START],
-            [lex.TokenType.TEXT, 'One {{message}}'],
+            [lex.TokenType.TEXT, 'One '],
+            [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_FORM_END],
             [lex.TokenType.TEXT, '\n'],
@@ -1174,7 +1251,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
             [lex.TokenType.EXPANSION_CASE_EXP_START],
-            [lex.TokenType.TEXT, 'One {{message}}'],
+            [lex.TokenType.TEXT, 'One '],
+            [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.EXPANSION_CASE_EXP_END],
             [lex.TokenType.EXPANSION_FORM_END],
             [lex.TokenType.TEXT, '\n'],
@@ -1301,8 +1380,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TEXT, '\n \n \n'],
         [lex.TokenType.EOF],
       ]);
-      expect(tokenizeAndHumanizeParts('\\r \\r \\r', {escapedString: true})).toEqual([
-        [lex.TokenType.TEXT, '\n \n \n'],  // post processing converts `\r` to `\n`
+      expect(tokenizeAndHumanizeParts('\\r{{\\r}}\\r', {escapedString: true})).toEqual([
+        // post processing converts `\r` to `\n`
+        [lex.TokenType.TEXT, '\n'],
+        [lex.TokenType.INTERPOLATION, '{{', '\n', '}}'],
+        [lex.TokenType.TEXT, '\n'],
        [lex.TokenType.EOF],
      ]);
      expect(tokenizeAndHumanizeParts('\\v \\v \\v', {escapedString: true})).toEqual([
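
Note (illustration, not part of the patch): the key invariant from the commit
message -- every INTERPOLATION token is preceded and followed by a TEXT token --
means a consumer can rebuild a text run by concatenating each token's parts.
A minimal standalone sketch, using an assumed token shape rather than the
compiler's real `Token` class:

```ts
// Sketch only: an assumed minimal token shape, not the compiler's Token class.
enum TokenType {
  TEXT,
  INTERPOLATION,
}

interface Token {
  type: TokenType;
  parts: string[];  // INTERPOLATION parts: [start marker, expression, end marker]
}

// Mirrors what _TreeBuilder._consumeText now does when it recombines TEXT and
// INTERPOLATION tokens (the entity decoding step is omitted for brevity).
function recombine(tokens: Token[]): string {
  return tokens.map(token => token.parts.join('')).join('');
}

// The commit message example: the contents of `<div>Hello, {{ name}}</div>`.
const tokens: Token[] = [
  {type: TokenType.TEXT, parts: ['Hello, ']},
  {type: TokenType.INTERPOLATION, parts: ['{{', ' name', '}}']},
  {type: TokenType.TEXT, parts: ['']},  // trailing TEXT token, even though empty
];

console.log(recombine(tokens));  // "Hello, {{ name}}"
```

The guaranteed empty TEXT tokens are what let the parser treat the stream
uniformly: it never has to special-case an interpolation at the start or end
of a text run.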
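Note (illustration, not part of the patch): the three branches of the new
`decodeEntity` helper can be checked in isolation. This sketch reuses the
function body from the patch with a one-entry stand-in for the real
NAMED_ENTITIES table (which lives in packages/compiler/src/ml_parser/entities.ts).
It also shows why the html_parser_spec test pairs `&#x25BE;` with `&#9662;`:
0x25BE and 9662 are the same code point, so both decode to the same character,
while `&#25BE;` fails the decimal check and is returned verbatim:

```ts
// Stand-in for the NAMED_ENTITIES table; the real one has many more entries.
const NAMED_ENTITIES: Record<string, string> = {'amp': '&'};

// Copied from the patch: decode one HTML entity, falling back to the raw match.
function decodeEntity(match: string, entity: string): string {
  if (NAMED_ENTITIES[entity] !== undefined) {
    return NAMED_ENTITIES[entity] || match;
  }
  if (/^#x[a-f0-9]+$/i.test(entity)) {
    return String.fromCodePoint(parseInt(entity.slice(2), 16));  // hex form, e.g. &#x25BE;
  }
  if (/^#\d+$/.test(entity)) {
    return String.fromCodePoint(parseInt(entity.slice(1), 10));  // decimal form, e.g. &#9662;
  }
  return match;  // unknown or malformed entity: leave the source text unchanged
}

// Same replacement the parser applies to recombined interpolation text.
const decode = (text: string) => text.replace(/&([^;]+);/g, decodeEntity);

console.log(decode('{{&amp;}}'));     // {{&}}
console.log(decode('{{&#x25BE;}}'));  // {{▾}}  (0x25BE === 9662)
console.log(decode('{{&#9662;}}'));   // {{▾}}
console.log(decode('{{&#25BE;}}'));   // {{&#25BE;}}  (not a valid decimal entity)
```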