feat(compiler): Parse and recover on incomplete opening HTML tags (#38681)
Let's say we have code like

```html
<div<span>123</span>
```

Currently this gets parsed into a tree with the element tag `div<span`. This has at least two downsides:

- An incorrect diagnostic is emitted saying that `</span>` doesn't close an element.
- A consumer of the parse tree using it for editor services is unable to provide correct completions for the opening `<span>` tag.

This patch attempts to fix both issues by instead parsing the code into the same tree that would be parsed for `<div></div><span>123</span>`. In particular, we do this by optimistically scanning an open tag as usual, but if we do not see a terminating `>`, we mark the tag as "incomplete". The parser then emits an error for the incomplete tag and adds a synthetic (recovered) element node to the tree with the incomplete open tag's name (see the sketch below).

What's the downside of this? For one, a breaking change.

<ol>
<li>
`<` symbols that are ambiguously text or opening tags will now be parsed as opening tags instead of text in element bodies. Take the code

```html
<p>a<b</p>
```

Previously `a<b` was tokenized as a single text node; with this change, `a` stays text and `<b` becomes an incomplete opening tag. Clearly we cannot have the best of both worlds, and this patch chooses to swap the parsing strategy to support the new feature. Of course, `<` can still be inserted as text via the `&lt;` entity.
</li>
</ol>

Part of #38596

PR Close #38681
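As a quick illustration of the recovered shape, here is a minimal sketch that runs the example above through `HtmlParser` and prints the element names plus the reported diagnostics. It is not part of the change itself; it is written as if it sat next to the existing specs in `packages/compiler/test/ml_parser`, so the relative imports below mirror the ones used by `html_parser_spec.ts` and are an assumption about where the snippet lives.

```ts
import * as html from '../../src/ml_parser/ast';
import {HtmlParser} from '../../src/ml_parser/html_parser';

const parser = new HtmlParser();
const {rootNodes, errors} = parser.parse('<div<span>123</span>', 'demo.html');

// With this change the tree is recovered as if the input were
// `<div></div><span>123</span>`: a synthetic <div> with no end tag,
// followed by a real <span> containing the text "123".
for (const node of rootNodes) {
  if (node instanceof html.Element) {
    console.log(node.name, node.endSourceSpan === null ? '(unterminated)' : '');
  }
}

// A single diagnostic is reported for the incomplete tag, e.g.
// `Opening tag "div" not terminated.` at 0:0.
console.log(errors.map(e => e.msg));
```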
parent 49f27e31ed
commit 6ae3b68acf
packages/compiler/src/ml_parser/lexer.ts

```diff
@@ -17,6 +17,7 @@ export enum TokenType {
   TAG_OPEN_END,
   TAG_OPEN_END_VOID,
   TAG_CLOSE,
+  INCOMPLETE_TAG_OPEN,
   TEXT,
   ESCAPABLE_RAW_TEXT,
   RAW_TEXT,
@@ -511,8 +512,6 @@ class _Tokenizer {
     let tagName: string;
     let prefix: string;
     let openTagToken: Token|undefined;
-    let tokensBeforeTagOpen = this.tokens.length;
-    const innerStart = this._cursor.clone();
     try {
       if (!chars.isAsciiLetter(this._cursor.peek())) {
         throw this._createError(
@@ -523,7 +522,8 @@ class _Tokenizer {
       prefix = openTagToken.parts[0];
       tagName = openTagToken.parts[1];
       this._attemptCharCodeUntilFn(isNotWhitespace);
-      while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT) {
+      while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT &&
+             this._cursor.peek() !== chars.$LT) {
         this._consumeAttributeName();
         this._attemptCharCodeUntilFn(isNotWhitespace);
         if (this._attemptCharCode(chars.$EQ)) {
@@ -535,14 +535,15 @@ class _Tokenizer {
       this._consumeTagOpenEnd();
     } catch (e) {
       if (e instanceof _ControlFlowError) {
-        // When the start tag is invalid (including invalid "attributes"), assume we want a "<"
-        this._cursor = innerStart;
         if (openTagToken) {
-          this.tokens.length = tokensBeforeTagOpen;
-        }
-        // Back to back text tokens are merged at the end
+          // We errored before we could close the opening tag, so it is incomplete.
+          openTagToken.type = TokenType.INCOMPLETE_TAG_OPEN;
+        } else {
+          // When the start tag is invalid, assume we want a "<" as text.
+          // Back to back text tokens are merged at the end.
           this._beginToken(TokenType.TEXT, start);
           this._endToken(['<']);
+        }
         return;
       }
 
@@ -772,8 +773,8 @@ function isNotWhitespace(code: number): boolean {
 }
 
 function isNameEnd(code: number): boolean {
-  return chars.isWhitespace(code) || code === chars.$GT || code === chars.$SLASH ||
-      code === chars.$SQ || code === chars.$DQ || code === chars.$EQ;
+  return chars.isWhitespace(code) || code === chars.$GT || code === chars.$LT ||
+      code === chars.$SLASH || code === chars.$SQ || code === chars.$DQ || code === chars.$EQ;
 }
 
 function isPrefixEnd(code: number): boolean {
```
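The `isNameEnd` change above is what makes the tokenizer stop a tag or attribute name at a stray `<`. Here is a tiny self-contained sketch of the new predicate and its effect; the `$`-prefixed constants and the `scanName` helper only imitate the compiler's `chars` module and name scanning, they are assumptions for illustration, not the real lexer code.

```ts
// Standalone approximation of the updated isNameEnd() predicate.
const $LT = '<'.charCodeAt(0), $GT = '>'.charCodeAt(0), $SLASH = '/'.charCodeAt(0);
const $SQ = `'`.charCodeAt(0), $DQ = '"'.charCodeAt(0), $EQ = '='.charCodeAt(0);
const isWhitespace = (code: number) => /\s/.test(String.fromCharCode(code));

function isNameEnd(code: number): boolean {
  // `<` is now a name terminator, so `<div<span>` ends the first name at `div`.
  return isWhitespace(code) || code === $GT || code === $LT ||
      code === $SLASH || code === $SQ || code === $DQ || code === $EQ;
}

function scanName(input: string, start: number): string {
  let end = start;
  while (end < input.length && !isNameEnd(input.charCodeAt(end))) end++;
  return input.slice(start, end);
}

// The name scan that begins after the first '<' now stops at the second '<',
// yielding 'div' instead of 'div<span'.
console.log(scanName('<div<span>', 1));  // 'div'
```

The real lexer performs the same check via `chars.$LT`, both in `isNameEnd` and in the attribute-scanning loop shown in the diff above.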
packages/compiler/src/ml_parser/html_parser.ts

```diff
@@ -56,7 +56,8 @@ class _TreeBuilder {
 
   build(): void {
     while (this._peek.type !== lex.TokenType.EOF) {
-      if (this._peek.type === lex.TokenType.TAG_OPEN_START) {
+      if (this._peek.type === lex.TokenType.TAG_OPEN_START ||
+          this._peek.type === lex.TokenType.INCOMPLETE_TAG_OPEN) {
         this._consumeStartTag(this._advance());
       } else if (this._peek.type === lex.TokenType.TAG_CLOSE) {
         this._consumeEndTag(this._advance());
@@ -233,8 +234,7 @@ class _TreeBuilder {
   }
 
   private _consumeStartTag(startTagToken: lex.Token) {
-    const prefix = startTagToken.parts[0];
-    const name = startTagToken.parts[1];
+    const [prefix, name] = startTagToken.parts;
     const attrs: html.Attribute[] = [];
     while (this._peek.type === lex.TokenType.ATTR_NAME) {
       attrs.push(this._consumeAttr(this._advance()));
@@ -266,6 +266,12 @@ class _TreeBuilder {
       // Elements that are self-closed have their `endSourceSpan` set to the full span, as the
       // element start tag also represents the end tag.
       this._popElement(fullName, span);
+    } else if (startTagToken.type === lex.TokenType.INCOMPLETE_TAG_OPEN) {
+      // We already know the opening tag is not complete, so it is unlikely it has a corresponding
+      // close tag. Let's optimistically parse it as a full element and emit an error.
+      this._popElement(fullName, null);
+      this.errors.push(
+          TreeError.create(fullName, span, `Opening tag "${fullName}" not terminated.`));
     }
   }
 
@@ -295,7 +301,13 @@ class _TreeBuilder {
     }
   }
 
-  private _popElement(fullName: string, endSourceSpan: ParseSourceSpan): boolean {
+  /**
+   * Closes the nearest element with the tag name `fullName` in the parse tree.
+   * `endSourceSpan` is the span of the closing tag, or null if the element does
+   * not have a closing tag (for example, this happens when an incomplete
+   * opening tag is recovered).
+   */
+  private _popElement(fullName: string, endSourceSpan: ParseSourceSpan|null): boolean {
     for (let stackIndex = this._elementStack.length - 1; stackIndex >= 0; stackIndex--) {
       const el = this._elementStack[stackIndex];
       if (el.name == fullName) {
@@ -303,7 +315,7 @@ class _TreeBuilder {
         // removed from the element stack at this point are closed implicitly, so they won't get
         // an end source span (as there is no explicit closing element).
         el.endSourceSpan = endSourceSpan;
-        el.sourceSpan.end = endSourceSpan.end || el.sourceSpan.end;
+        el.sourceSpan.end = endSourceSpan !== null ? endSourceSpan.end : el.sourceSpan.end;
 
         this._elementStack.splice(stackIndex, this._elementStack.length - stackIndex);
         return true;
```
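Because recovered elements are closed with `_popElement(fullName, null)`, they end up in the tree with a `null` `endSourceSpan`, just like implicitly closed elements. A hedged sketch of how an editor-services consumer might surface them, again assuming it lives alongside the specs so the same relative imports resolve; the helper name `findUnterminated` is made up for illustration.

```ts
import * as html from '../../src/ml_parser/ast';
import {HtmlParser} from '../../src/ml_parser/html_parser';

// Collect elements that have no end source span. After this change, elements
// recovered from an incomplete opening tag fall into this bucket (alongside
// implicitly closed elements), so tooling can flag them instead of seeing a
// bogus `div<span` tag name.
function findUnterminated(nodes: html.Node[], out: html.Element[] = []): html.Element[] {
  for (const node of nodes) {
    if (node instanceof html.Element) {
      if (node.endSourceSpan === null) {
        out.push(node);
      }
      findUnterminated(node.children, out);
    }
  }
  return out;
}

const {rootNodes} = new HtmlParser().parse('<div<span>123</span>', 'demo.html');
for (const el of findUnterminated(rootNodes)) {
  console.log(`unterminated <${el.name}> at ${el.sourceSpan.start}`);
}
```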
packages/compiler/test/ml_parser/html_parser_spec.ts

```diff
@@ -11,7 +11,7 @@ import {HtmlParser, ParseTreeResult, TreeError} from '../../src/ml_parser/html_p
 import {TokenType} from '../../src/ml_parser/lexer';
 import {ParseError} from '../../src/parse_util';
 
-import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spec_utils';
+import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn, humanizeNodes} from './ast_spec_utils';
 
 {
   describe('HtmlParser', () => {
@@ -622,7 +622,7 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spe
             `{a, select, b {foo} % { bar {% bar}}`, 'TestComp', {tokenizeExpansionForms: true});
         expect(humanizeErrors(p.errors)).toEqual([
           [
-            6,
+            TokenType.RAW_TEXT,
             'Unexpected character "EOF" (Do you have an unescaped "{" in your template? Use "{{ \'{\' }}") to escape it.)',
             '0:36'
           ],
@@ -840,14 +840,66 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spe
         ]]);
       });
 
+      describe('incomplete element tag', () => {
+        it('should parse and report incomplete tags after the tag name', () => {
+          const {errors, rootNodes} = parser.parse('<div<span><div </span>', 'TestComp');
+
+          expect(humanizeNodes(rootNodes, true)).toEqual([
+            [html.Element, 'div', 0, '<div', '<div', null],
+            [html.Element, 'span', 0, '<span><div </span>', '<span>', '</span>'],
+            [html.Element, 'div', 1, '<div ', '<div ', null],
+          ]);
+
+          expect(humanizeErrors(errors)).toEqual([
+            ['div', 'Opening tag "div" not terminated.', '0:0'],
+            ['div', 'Opening tag "div" not terminated.', '0:10'],
+          ]);
+        });
+
+        it('should parse and report incomplete tags after attribute', () => {
+          const {errors, rootNodes} =
+              parser.parse('<div class="hi" sty<span></span>', 'TestComp');
+
+          expect(humanizeNodes(rootNodes, true)).toEqual([
+            [html.Element, 'div', 0, '<div class="hi" sty', '<div class="hi" sty', null],
+            [html.Attribute, 'class', 'hi', 'class="hi"'],
+            [html.Attribute, 'sty', '', 'sty'],
+            [html.Element, 'span', 0, '<span></span>', '<span>', '</span>'],
+          ]);
+
+          expect(humanizeErrors(errors)).toEqual([
+            ['div', 'Opening tag "div" not terminated.', '0:0'],
+          ]);
+        });
+
+        it('should parse and report incomplete tags after quote', () => {
+          const {errors, rootNodes} = parser.parse('<div "<span></span>', 'TestComp');
+
+          expect(humanizeNodes(rootNodes, true)).toEqual([
+            [html.Element, 'div', 0, '<div ', '<div ', null],
+            [html.Text, '"', 0, '"'],
+            [html.Element, 'span', 0, '<span></span>', '<span>', '</span>'],
+          ]);
+
+          expect(humanizeErrors(errors)).toEqual([
+            ['div', 'Opening tag "div" not terminated.', '0:0'],
+          ]);
+        });
+
         it('should report subsequent open tags without proper close tag', () => {
           const errors = parser.parse('<div</div>', 'TestComp').errors;
-          expect(errors.length).toEqual(1);
-          expect(humanizeErrors(errors)).toEqual([[
+          expect(errors.length).toEqual(2);
+          expect(humanizeErrors(errors)).toEqual([
+            ['div', 'Opening tag "div" not terminated.', '0:0'],
+            // TODO(ayazhafiz): the following error is unnecessary and can be pruned if we keep
+            // track of the incomplete tag names.
+            [
               'div',
               'Unexpected closing tag "div". It may happen when the tag has already been closed by another tag. For more info see https://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags',
               '0:4'
-          ]]);
+            ]
+          ]);
+        });
       });
 
       it('should report closing tag for void elements', () => {
```
packages/compiler/test/ml_parser/lexer_spec.ts

```diff
@@ -232,6 +232,45 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
           [lex.TokenType.EOF, ''],
         ]);
       });
+
+      describe('tags', () => {
+        it('after tag name', () => {
+          expect(tokenizeAndHumanizeSourceSpans('<div<span><div</span>')).toEqual([
+            [lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
+            [lex.TokenType.TAG_OPEN_START, '<span'],
+            [lex.TokenType.TAG_OPEN_END, '>'],
+            [lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
+            [lex.TokenType.TAG_CLOSE, '</span>'],
+            [lex.TokenType.EOF, ''],
+          ]);
+        });
+
+        it('in attribute', () => {
+          expect(tokenizeAndHumanizeSourceSpans('<div class="hi" sty<span></span>')).toEqual([
+            [lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
+            [lex.TokenType.ATTR_NAME, 'class'],
+            [lex.TokenType.ATTR_QUOTE, '"'],
+            [lex.TokenType.ATTR_VALUE, 'hi'],
+            [lex.TokenType.ATTR_QUOTE, '"'],
+            [lex.TokenType.ATTR_NAME, 'sty'],
+            [lex.TokenType.TAG_OPEN_START, '<span'],
+            [lex.TokenType.TAG_OPEN_END, '>'],
+            [lex.TokenType.TAG_CLOSE, '</span>'],
+            [lex.TokenType.EOF, ''],
+          ]);
+        });
+
+        it('after quote', () => {
+          expect(tokenizeAndHumanizeSourceSpans('<div "<span></span>')).toEqual([
+            [lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
+            [lex.TokenType.TEXT, '"'],
+            [lex.TokenType.TAG_OPEN_START, '<span'],
+            [lex.TokenType.TAG_OPEN_END, '>'],
+            [lex.TokenType.TAG_CLOSE, '</span>'],
+            [lex.TokenType.EOF, ''],
+          ]);
+        });
+      });
     });
 
     describe('attributes', () => {
@@ -554,7 +593,8 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
       expect(tokenizeAndHumanizeSourceSpans('<p>a<b</p>')).toEqual([
         [lex.TokenType.TAG_OPEN_START, '<p'],
         [lex.TokenType.TAG_OPEN_END, '>'],
-        [lex.TokenType.TEXT, 'a<b'],
+        [lex.TokenType.TEXT, 'a'],
+        [lex.TokenType.INCOMPLETE_TAG_OPEN, '<b'],
         [lex.TokenType.TAG_CLOSE, '</p>'],
         [lex.TokenType.EOF, ''],
       ]);
@@ -579,25 +619,41 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
 
     it('should parse start tags quotes in place of an attribute name as text', () => {
       expect(tokenizeAndHumanizeParts('<t ">')).toEqual([
-        [lex.TokenType.TEXT, '<t ">'],
+        [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
+        [lex.TokenType.TEXT, '">'],
         [lex.TokenType.EOF],
       ]);
 
       expect(tokenizeAndHumanizeParts('<t \'>')).toEqual([
-        [lex.TokenType.TEXT, '<t \'>'],
+        [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
+        [lex.TokenType.TEXT, '\'>'],
         [lex.TokenType.EOF],
       ]);
     });
 
-    it('should parse start tags quotes in place of an attribute name (after a valid attribute) as text',
+    it('should parse start tags quotes in place of an attribute name (after a valid attribute)',
        () => {
         expect(tokenizeAndHumanizeParts('<t a="b" ">')).toEqual([
-          [lex.TokenType.TEXT, '<t a="b" ">'],
+          [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
+          [lex.TokenType.ATTR_NAME, '', 'a'],
+          [lex.TokenType.ATTR_QUOTE, '"'],
+          [lex.TokenType.ATTR_VALUE, 'b'],
+          [lex.TokenType.ATTR_QUOTE, '"'],
+          // TODO(ayazhafiz): the " symbol should be a synthetic attribute,
+          // allowing us to complete the opening tag correctly.
+          [lex.TokenType.TEXT, '">'],
           [lex.TokenType.EOF],
         ]);
 
         expect(tokenizeAndHumanizeParts('<t a=\'b\' \'>')).toEqual([
-          [lex.TokenType.TEXT, '<t a=\'b\' \'>'],
+          [lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
+          [lex.TokenType.ATTR_NAME, '', 'a'],
+          [lex.TokenType.ATTR_QUOTE, '\''],
+          [lex.TokenType.ATTR_VALUE, 'b'],
+          [lex.TokenType.ATTR_QUOTE, '\''],
+          // TODO(ayazhafiz): the ' symbol should be a synthetic attribute,
+          // allowing us to complete the opening tag correctly.
+          [lex.TokenType.TEXT, '\'>'],
          [lex.TokenType.EOF],
        ]);
       });
 
```