diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts
index f0fb361232..d62a54f576 100644
--- a/packages/compiler/src/ml_parser/lexer.ts
+++ b/packages/compiler/src/ml_parser/lexer.ts
@@ -22,6 +22,7 @@ export enum TokenType {
   TEXT,
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
+  INTERPOLATION,
   COMMENT_START,
   COMMENT_END,
   CDATA_START,
@@ -285,7 +286,7 @@ class _Tokenizer {
     }
     const token = new Token(
         this._currentTokenType, parts,
-        this._cursor.getSpan(this._currentTokenStart, this._leadingTriviaCodePoints));
+        (end ?? this._cursor).getSpan(this._currentTokenStart, this._leadingTriviaCodePoints));
     this.tokens.push(token);
     this._currentTokenStart = null;
    this._currentTokenType = null;
@@ -696,19 +697,16 @@ class _Tokenizer {
   }
 
   private _consumeText() {
-    const start = this._cursor.clone();
-    this._beginToken(TokenType.TEXT, start);
+    this._beginToken(TokenType.TEXT);
     const parts: string[] = [];
 
     do {
+      const current = this._cursor.clone();
       if (this._interpolationConfig && this._attemptStr(this._interpolationConfig.start)) {
-        parts.push(this._interpolationConfig.start);
-        this._inInterpolation = true;
-      } else if (
-          this._interpolationConfig && this._inInterpolation &&
-          this._attemptStr(this._interpolationConfig.end)) {
-        parts.push(this._interpolationConfig.end);
-        this._inInterpolation = false;
+        this._endToken([this._processCarriageReturns(parts.join(''))], current);
+        this._consumeInterpolation(current);
+        parts.length = 0;
+        this._beginToken(TokenType.TEXT);
       } else {
         parts.push(this._readChar(true));
       }
@@ -721,6 +719,61 @@ class _Tokenizer {
     }
     this._endToken([this._processCarriageReturns(parts.join(''))]);
   }
+  private _consumeInterpolation(interpolationStart: CharacterCursor) {
+    const parts: string[] = [];
+    this._beginToken(TokenType.INTERPOLATION, interpolationStart);
+    parts.push(this._interpolationConfig.start);
+
+    // Find the end of the interpolation, ignoring content inside quotes.
+    const expressionStart = this._cursor.clone();
+    let inQuote: string|null = null;
+    let inComment = false;
+    while (this._cursor.peek() !== chars.$EOF) {
+      const current = this._cursor.clone();
+
+      if (this._isTagStart()) {
+        // We are starting what looks like an HTML element in the middle of this interpolation.
+        // Reset the cursor to before the `<` character and end the interpolation token.
+        // (This is actually wrong but here for backward compatibility).
+        this._cursor = current;
+        parts.push(this._getProcessedChars(expressionStart, current));
+        return this._endToken(parts);
+      }
+
+      if (inQuote === null) {
+        if (this._attemptStr(this._interpolationConfig.end)) {
+          // We are not in a string, and we hit the end interpolation marker.
+          parts.push(this._getProcessedChars(expressionStart, current));
+          parts.push(this._interpolationConfig.end);
+          return this._endToken(parts);
+        } else if (this._attemptStr('//')) {
+          // Once we are in a comment we ignore any quotes.
+          inComment = true;
+        }
+      }
+
+      const char = this._readChar(true);
+      if (char === '\\') {
+        // Skip the next character because it was escaped.
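+        // (Annotation, not in the original patch: e.g. in `{{ "a\"b" }}` the escaped
+        // quote must not be treated as the end of the quoted string.)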
+        this._readChar(true);
+      } else if (char === inQuote) {
+        // Exiting the current quoted string.
+        inQuote = null;
+      } else if (!inComment && /['"`]/.test(char)) {
+        // Entering a new quoted string.
+        inQuote = char;
+      }
+    }
+
+    // We hit EOF without finding a closing interpolation marker.
+    parts.push(this._getProcessedChars(expressionStart, this._cursor));
+    return this._endToken(parts);
+  }
+
+  private _getProcessedChars(start: CharacterCursor, end: CharacterCursor): string {
+    return this._processCarriageReturns(end.getChars(start));
+  }
+
   private _isTextEnd(): boolean {
     if (this._isTagStart() || this._cursor.peek() === chars.$EOF) {
       return true;
diff --git a/packages/compiler/src/ml_parser/parser.ts b/packages/compiler/src/ml_parser/parser.ts
index 24465f8e97..fd01357d43 100644
--- a/packages/compiler/src/ml_parser/parser.ts
+++ b/packages/compiler/src/ml_parser/parser.ts
@@ -9,6 +9,7 @@
 import {ParseError, ParseSourceSpan} from '../parse_util';
 
 import * as html from './ast';
+import {NAMED_ENTITIES} from './entities';
 import * as lex from './lexer';
 import {getNsPrefix, mergeNsAndName, splitNsName, TagDefinition} from './tags';
 
@@ -215,6 +216,7 @@ class _TreeBuilder {
   }
 
   private _consumeText(token: lex.Token) {
+    const startSpan = token.sourceSpan;
     let text = token.parts[0];
     if (text.length > 0 && text[0] == '\n') {
       const parent = this._getParentElement();
@@ -224,8 +226,29 @@ class _TreeBuilder {
       }
     }
 
+    // For now recombine text and interpolation tokens.
+    if (this._peek.type === lex.TokenType.INTERPOLATION) {
+      while (this._peek.type === lex.TokenType.INTERPOLATION ||
+             this._peek.type === lex.TokenType.TEXT) {
+        token = this._advance();
+        if (token.type === lex.TokenType.INTERPOLATION) {
+          // For backward compatibility we decode HTML entities that appear in interpolation
+          // expressions. This is arguably a bug, but it could be a considerable breaking change to
+          // fix it. It should be addressed in a larger project to refactor the entire parser/lexer
+          // chain after View Engine has been removed.
+          text += token.parts.join('').replace(/&([^;]+);/g, decodeEntity);
+        } else {
+          text += token.parts.join('');
+        }
+      }
+    }
+
     if (text.length > 0) {
-      this._addToParent(new html.Text(text, token.sourceSpan));
+      const endSpan = token.sourceSpan;
+      this._addToParent(new html.Text(
+          text,
+          new ParseSourceSpan(
+              startSpan.start, endSpan.end, startSpan.fullStart, startSpan.details)));
     }
   }
 
@@ -395,3 +418,21 @@ class _TreeBuilder {
 function lastOnStack(stack: any[], element: any): boolean {
   return stack.length > 0 && stack[stack.length - 1] === element;
 }
+
+/**
+ * Decode the `entity` string, which we believe is the contents of an HTML entity.
+ *
+ * If the string is not actually a valid/known entity then just return the original `match` string.
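+ *
+ * (Annotation, not in the original patch: e.g. `decodeEntity('&amp;', 'amp')` returns `'&'`,
+ * `decodeEntity('&#x25BE;', '#x25BE')` returns `'\u25BE'`, and
+ * `decodeEntity('&unknown;', 'unknown')` returns `'&unknown;'` unchanged.)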
+ */
+function decodeEntity(match: string, entity: string): string {
+  if (NAMED_ENTITIES[entity] !== undefined) {
+    return NAMED_ENTITIES[entity] || match;
+  }
+  if (/^#x[a-f0-9]+$/i.test(entity)) {
+    return String.fromCodePoint(parseInt(entity.slice(2), 16));
+  }
+  if (/^#\d+$/.test(entity)) {
+    return String.fromCodePoint(parseInt(entity.slice(1), 10));
+  }
+  return match;
+}
diff --git a/packages/compiler/test/ml_parser/html_parser_spec.ts b/packages/compiler/test/ml_parser/html_parser_spec.ts
index b971d9187a..279bca60d3 100644
--- a/packages/compiler/test/ml_parser/html_parser_spec.ts
+++ b/packages/compiler/test/ml_parser/html_parser_spec.ts
@@ -675,6 +675,32 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn, humanizeNodes}
       expect(node.endSourceSpan!.end.offset).toEqual(12);
     });
 
+    // This checks backward compatibility with a previous version of the lexer, which would
+    // treat interpolation expressions as regular HTML escapable text.
+    it('should decode HTML entities in interpolations', () => {
+      expect(humanizeDomSourceSpans(parser.parse(
+                 '{{&amp;}}' +
+                     '{{&#x25BE;}}' +
+                     '{{&#9662;}}' +
+                     '{{&amp (no semi-colon)}}' +
+                     '{{&#25BE; (invalid decimal)}}',
+                 'TestComp')))
+          .toEqual([[
+            html.Text,
+            '{{&}}' +
+                '{{\u25BE}}' +
+                '{{\u25BE}}' +
+                '{{&amp (no semi-colon)}}' +
+                '{{&#25BE; (invalid decimal)}}',
+            0,
+            '{{&amp;}}' +
+                '{{&#x25BE;}}' +
+                '{{&#9662;}}' +
+                '{{&amp (no semi-colon)}}' +
+                '{{&#25BE; (invalid decimal)}}',
+          ]]);
+    });
+
     it('should not set the end source span for void elements', () => {
       expect(humanizeDomSourceSpans(parser.parse('<div><br></div>', 'TestComp'))).toEqual([
         [html.Element, 'div', 0, '<div><br></div>', '<div>', '</div>'],
diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts
index 5c795ed959..54005b28ba 100644
--- a/packages/compiler/test/ml_parser/lexer_spec.ts
+++ b/packages/compiler/test/ml_parser/lexer_spec.ts
@@ -549,25 +549,66 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
     });
 
     it('should parse interpolation', () => {
-      expect(tokenizeAndHumanizeParts('{{ a }}b{{ c // comment }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a }}b{{ c // comment }}'],
-        [lex.TokenType.EOF],
+      expect(tokenizeAndHumanizeParts('{{ a }}b{{ c // comment }}d{{ e "}}" f }}g{{ h // " i }}'))
+          .toEqual([
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.INTERPOLATION, '{{', ' a ', '}}'],
+            [lex.TokenType.TEXT, 'b'],
+            [lex.TokenType.INTERPOLATION, '{{', ' c // comment ', '}}'],
+            [lex.TokenType.TEXT, 'd'],
+            [lex.TokenType.INTERPOLATION, '{{', ' e "}}" f ', '}}'],
+            [lex.TokenType.TEXT, 'g'],
+            [lex.TokenType.INTERPOLATION, '{{', ' h // " i ', '}}'],
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.EOF],
+          ]);
+
+      expect(tokenizeAndHumanizeSourceSpans('{{ a }}b{{ c // comment }}')).toEqual([
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{ a }}'],
+        [lex.TokenType.TEXT, 'b'],
+        [lex.TokenType.INTERPOLATION, '{{ c // comment }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.EOF, ''],
       ]);
     });
 
     it('should parse interpolation with custom markers', () => {
       expect(tokenizeAndHumanizeParts('{% a %}', {interpolationConfig: {start: '{%', end: '%}'}}))
           .toEqual([
-            [lex.TokenType.TEXT, '{% a %}'],
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.INTERPOLATION, '{%', ' a ', '%}'],
+            [lex.TokenType.TEXT, ''],
             [lex.TokenType.EOF],
           ]);
     });
 
-    it('should handle CR & LF', () => {
+    it('should handle CR & LF in text', () => {
       expect(tokenizeAndHumanizeParts('t\ne\rs\r\nt')).toEqual([
         [lex.TokenType.TEXT, 't\ne\ns\nt'],
         [lex.TokenType.EOF],
       ]);
+
+      expect(tokenizeAndHumanizeSourceSpans('t\ne\rs\r\nt')).toEqual([
+        [lex.TokenType.TEXT, 't\ne\rs\r\nt'],
+        [lex.TokenType.EOF, ''],
+      ]);
+    });
+
+    it('should handle CR & LF in interpolation', () => {
+      expect(tokenizeAndHumanizeParts('{{t\ne\rs\r\nt}}')).toEqual([
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', 't\ne\ns\nt', '}}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.EOF],
+      ]);
+
+      expect(tokenizeAndHumanizeSourceSpans('{{t\ne\rs\r\nt}}')).toEqual([
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{t\ne\rs\r\nt}}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.EOF, ''],
+      ]);
     });
 
     it('should parse entities', () => {
@@ -575,6 +616,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TEXT, 'a&b'],
         [lex.TokenType.EOF],
       ]);
+
+      expect(tokenizeAndHumanizeSourceSpans('a&amp;b')).toEqual([
+        [lex.TokenType.TEXT, 'a&amp;b'],
+        [lex.TokenType.EOF, ''],
+      ]);
     });
 
     it('should parse text starting with "&"', () => {
@@ -593,7 +639,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
     it('should allow "<" in text nodes', () => {
       expect(tokenizeAndHumanizeParts('{{ a < b ? c : d }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a < b ? c : d }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a < b ? c : d ', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
 
@@ -614,7 +662,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
    it('should break out of interpolation in text token on valid start tag', () => {
      expect(tokenizeAndHumanizeParts('{{ a <b && c > d }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a '],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a '],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.TAG_OPEN_START, '', 'b'],
         [lex.TokenType.ATTR_NAME, '', '&&'],
         [lex.TokenType.ATTR_NAME, '', 'c'],
@@ -626,7 +676,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
     it('should break out of interpolation in text token on valid comment', () => {
       expect(tokenizeAndHumanizeParts('{{ a }<!---->}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a }'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a }'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.COMMENT_START],
         [lex.TokenType.RAW_TEXT, ''],
         [lex.TokenType.COMMENT_END],
@@ -637,7 +689,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
     it('should break out of interpolation in text token on valid CDATA', () => {
       expect(tokenizeAndHumanizeParts('{{ a }<![CDATA[]]>}')).toEqual([
-        [lex.TokenType.TEXT, '{{ a }'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' a }'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.CDATA_START],
         [lex.TokenType.RAW_TEXT, ''],
         [lex.TokenType.CDATA_END],
@@ -653,13 +707,14 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
           .toEqual([
             [lex.TokenType.TAG_OPEN_START, '', 'code'],
             [lex.TokenType.TAG_OPEN_END],
-            [lex.TokenType.TEXT, '{{\'<={\'}}'],
+            [lex.TokenType.TEXT, ''],
+            [lex.TokenType.INTERPOLATION, '{{', '\'<={\'', '}}'],
+            [lex.TokenType.TEXT, ''],
            [lex.TokenType.TAG_CLOSE, '', 'code'],
             [lex.TokenType.EOF],
           ]);
     });
 
-
     it('should parse start tags quotes in place of an attribute name as text', () => {
       expect(tokenizeAndHumanizeParts('<t ">')).toEqual([
         [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
@@ -703,18 +758,32 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
     it('should be able to escape {', () => {
       expect(tokenizeAndHumanizeParts('{{ "{" }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ "{" }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' "{" ', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EOF],
       ]);
     });
 
     it('should be able to escape {{', () => {
       expect(tokenizeAndHumanizeParts('{{ "{{" }}')).toEqual([
-        [lex.TokenType.TEXT, '{{ "{{" }}'],
+        [lex.TokenType.TEXT, ''],
+        [lex.TokenType.INTERPOLATION, '{{', ' "{{" ', '}}'],
+        [lex.TokenType.TEXT, ''],
        [lex.TokenType.EOF],
       ]);
     });
 
+    it('should capture everything up to the end of file in the interpolation expression part if there are mismatched quotes',
+       () => {
+         expect(tokenizeAndHumanizeParts('{{ "{{a}}\' }}')).toEqual([
+           [lex.TokenType.TEXT, ''],
+           [lex.TokenType.INTERPOLATION, '{{', ' "{{a}}\' }}'],
+           [lex.TokenType.TEXT, ''],
+           [lex.TokenType.EOF],
+         ]);
+       });
+
     it('should treat expansion form as text when they are not parsed', () => {
       expect(tokenizeAndHumanizeParts(
                  '<span>{a, b, =4 {c}}</span>', {tokenizeExpansionForms: false}))
@@ -976,7 +1045,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.RAW_TEXT, 'three'],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=4'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'four {{a}}'],
+        [lex.TokenType.TEXT, 'four '],
+        [lex.TokenType.INTERPOLATION, '{{', 'a', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.EOF],
       ]);
@@ -1033,7 +1104,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'One {{message}}'],
+        [lex.TokenType.TEXT, 'One '],
+        [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.TEXT, '\n'],
@@ -1063,7 +1136,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'One {{message}}'],
+        [lex.TokenType.TEXT, 'One '],
+        [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.TEXT, '\n'],
@@ -1144,7 +1219,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'One {{message}}'],
+        [lex.TokenType.TEXT, 'One '],
+        [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.TEXT, '\n'],
@@ -1174,7 +1251,9 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_CASE_VALUE, '=1'],
         [lex.TokenType.EXPANSION_CASE_EXP_START],
-        [lex.TokenType.TEXT, 'One {{message}}'],
+        [lex.TokenType.TEXT, 'One '],
+        [lex.TokenType.INTERPOLATION, '{{', 'message', '}}'],
+        [lex.TokenType.TEXT, ''],
         [lex.TokenType.EXPANSION_CASE_EXP_END],
         [lex.TokenType.EXPANSION_FORM_END],
         [lex.TokenType.TEXT, '\n'],
@@ -1301,8 +1380,11 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
         [lex.TokenType.TEXT, '\n \n \n'],
         [lex.TokenType.EOF],
       ]);
-      expect(tokenizeAndHumanizeParts('\\r \\r \\r', {escapedString: true})).toEqual([
-        [lex.TokenType.TEXT, '\n \n \n'],  // post processing converts `\r` to `\n`
+      expect(tokenizeAndHumanizeParts('\\r{{\\r}}\\r', {escapedString: true})).toEqual([
+        // post processing converts `\r` to `\n`
+        [lex.TokenType.TEXT, '\n'],
+        [lex.TokenType.INTERPOLATION, '{{', '\n', '}}'],
+        [lex.TokenType.TEXT, '\n'],
         [lex.TokenType.EOF],
       ]);
       expect(tokenizeAndHumanizeParts('\\v \\v \\v', {escapedString: true})).toEqual([
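
Annotation (not part of the patch): a minimal sketch of the parser's entity fallback,
reusing the `decodeEntity` function and the same `/&([^;]+);/g` regex that
`_consumeText` applies to the joined parts of an INTERPOLATION token. The sample
strings are invented for illustration:

    // Known named entities and well-formed numeric references are decoded.
    '{{ &amp; &#x25BE; &#9662; }}'.replace(/&([^;]+);/g, decodeEntity);
    // => '{{ & \u25BE \u25BE }}'

    // Unknown names and malformed references fall through to the original match.
    '{{ &unknown; &#xZZ; &#9x; }}'.replace(/&([^;]+);/g, decodeEntity);
    // => '{{ &unknown; &#xZZ; &#9x; }}'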