fix(HtmlLexer): handle CR in input stream per HTML spec

fixes #5618
Closes #5629
This commit is contained in:
Victor Berchet 2015-12-04 23:12:31 -08:00
parent daaa8ee1cd
commit 9850e68703
2 changed files with 59 additions and 20 deletions

View File

@ -83,6 +83,9 @@ const $x = 120;
const $NBSP = 160; const $NBSP = 160;
// Matches every CR immediately followed by LF (global flag, so all occurrences).
var CRLF_REGEXP = /\r\n/g;
// Matches every CR character (global flag, so all occurrences).
var CR_REGEXP = /\r/g;
function unexpectedCharacterErrorMsg(charCode: number): string { function unexpectedCharacterErrorMsg(charCode: number): string {
var char = charCode === $EOF ? 'EOF' : StringWrapper.fromCharCode(charCode); var char = charCode === $EOF ? 'EOF' : StringWrapper.fromCharCode(charCode);
return `Unexpected character "${char}"`; return `Unexpected character "${char}"`;
@ -119,6 +122,14 @@ class _HtmlTokenizer {
this._advance(); this._advance();
} }
/**
 * Normalizes carriage returns in a piece of token content.
 *
 * See http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
 *
 * The input itself is not pre-processed, so that original source positions are
 * kept; instead this is applied to the content right before tokens are
 * instantiated.
 *
 * @param content the raw text to normalize
 * @returns `content` with each CRLF pair and each remaining CR replaced by a
 *     single LF
 */
private _processCarriageReturns(content: string): string {
  // Step 1: collapse every CRLF pair into a single CR.
  var crOnly = StringWrapper.replaceAll(content, CRLF_REGEXP, '\r');
  // Step 2: turn every CR (collapsed or originally alone) into an LF.
  var normalized = StringWrapper.replaceAll(crOnly, CR_REGEXP, '\n');
  return normalized;
}
tokenize(): HtmlTokenizeResult { tokenize(): HtmlTokenizeResult {
while (this.peek !== $EOF) { while (this.peek !== $EOF) {
var start = this._getLocation(); var start = this._getLocation();
@ -315,7 +326,7 @@ class _HtmlTokenizer {
parts.push(this._readChar(decodeEntities)); parts.push(this._readChar(decodeEntities));
} }
} }
return this._endToken([parts.join('')], tagCloseStart); return this._endToken([this._processCarriageReturns(parts.join(''))], tagCloseStart);
} }
private _consumeComment(start: ParseLocation) { private _consumeComment(start: ParseLocation) {
@ -428,7 +439,7 @@ class _HtmlTokenizer {
this._requireUntilFn(isNameEnd, 1); this._requireUntilFn(isNameEnd, 1);
value = this.input.substring(valueStart, this.index); value = this.input.substring(valueStart, this.index);
} }
this._endToken([value]); this._endToken([this._processCarriageReturns(value)]);
} }
private _consumeTagOpenEnd() { private _consumeTagOpenEnd() {
@ -456,7 +467,7 @@ class _HtmlTokenizer {
while (!isTextEnd(this.peek)) { while (!isTextEnd(this.peek)) {
parts.push(this._readChar(true)); parts.push(this._readChar(true));
} }
this._endToken([parts.join('')]); this._endToken([this._processCarriageReturns(parts.join(''))]);
} }
private _savePosition(): number[] { return [this.peek, this.index, this.column, this.line]; } private _savePosition(): number[] { return [this.peek, this.index, this.column, this.line]; }

View File

@ -53,23 +53,35 @@ export function main() {
[HtmlTokenType.EOF, '2:5'] [HtmlTokenType.EOF, '2:5']
]); ]);
}); });
// Checks the positions reported for each token when the input mixes LF, a CRLF
// pair and a lone CR (positions are presumably formatted as 'line:column').
// Per the expected values, the TEXT token starts on the CR of the CRLF pair
// (1:1) and the closing tag is at 2:1, i.e. the lone CR after 'a' neither
// starts a new line nor advances the column.
it('should work with CR and LF', () => {
expect(tokenizeAndHumanizeLineColumn('<t\n>\r\na\r</t>'))
.toEqual([
[HtmlTokenType.TAG_OPEN_START, '0:0'],
[HtmlTokenType.TAG_OPEN_END, '1:0'],
[HtmlTokenType.TEXT, '1:1'],
[HtmlTokenType.TAG_CLOSE, '2:1'],
[HtmlTokenType.EOF, '2:5']
]);
});
}); });
describe('comments', () => { describe('comments', () => {
it('should parse comments', () => { it('should parse comments', () => {
expect(tokenizeAndHumanizeParts('<!--test-->')) expect(tokenizeAndHumanizeParts('<!--t\ne\rs\r\nt-->'))
.toEqual([ .toEqual([
[HtmlTokenType.COMMENT_START], [HtmlTokenType.COMMENT_START],
[HtmlTokenType.RAW_TEXT, 'test'], [HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
[HtmlTokenType.COMMENT_END], [HtmlTokenType.COMMENT_END],
[HtmlTokenType.EOF] [HtmlTokenType.EOF]
]); ]);
}); });
it('should store the locations', () => {expect(tokenizeAndHumanizeSourceSpans('<!--test-->')) it('should store the locations',
() => {expect(tokenizeAndHumanizeSourceSpans('<!--t\ne\rs\r\nt-->'))
.toEqual([ .toEqual([
[HtmlTokenType.COMMENT_START, '<!--'], [HtmlTokenType.COMMENT_START, '<!--'],
[HtmlTokenType.RAW_TEXT, 'test'], [HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
[HtmlTokenType.COMMENT_END, '-->'], [HtmlTokenType.COMMENT_END, '-->'],
[HtmlTokenType.EOF, ''] [HtmlTokenType.EOF, '']
])}); ])});
@ -104,20 +116,20 @@ export function main() {
describe('cdata', () => { describe('cdata', () => {
it('should parse cdata', () => { it('should parse cdata', () => {
expect(tokenizeAndHumanizeParts('<![cdata[test]]>')) expect(tokenizeAndHumanizeParts('<![cdata[t\ne\rs\r\nt]]>'))
.toEqual([ .toEqual([
[HtmlTokenType.CDATA_START], [HtmlTokenType.CDATA_START],
[HtmlTokenType.RAW_TEXT, 'test'], [HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
[HtmlTokenType.CDATA_END], [HtmlTokenType.CDATA_END],
[HtmlTokenType.EOF] [HtmlTokenType.EOF]
]); ]);
}); });
it('should store the locations', () => { it('should store the locations', () => {
expect(tokenizeAndHumanizeSourceSpans('<![cdata[test]]>')) expect(tokenizeAndHumanizeSourceSpans('<![cdata[t\ne\rs\r\nt]]>'))
.toEqual([ .toEqual([
[HtmlTokenType.CDATA_START, '<![cdata['], [HtmlTokenType.CDATA_START, '<![cdata['],
[HtmlTokenType.RAW_TEXT, 'test'], [HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
[HtmlTokenType.CDATA_END, ']]>'], [HtmlTokenType.CDATA_END, ']]>'],
[HtmlTokenType.EOF, ''] [HtmlTokenType.EOF, '']
]); ]);
@ -301,6 +313,17 @@ export function main() {
]); ]);
}); });
// A quoted attribute value containing an LF, a lone CR and a CRLF pair is
// expected to be reported with each of them as a single LF.
it('should parse values with CR and LF', () => {
expect(tokenizeAndHumanizeParts("<t a='t\ne\rs\r\nt'>"))
.toEqual([
[HtmlTokenType.TAG_OPEN_START, null, 't'],
[HtmlTokenType.ATTR_NAME, null, 'a'],
[HtmlTokenType.ATTR_VALUE, 't\ne\ns\nt'],
[HtmlTokenType.TAG_OPEN_END],
[HtmlTokenType.EOF]
]);
});
it('should store the locations', () => { it('should store the locations', () => {
expect(tokenizeAndHumanizeSourceSpans('<t a=b>')) expect(tokenizeAndHumanizeSourceSpans('<t a=b>'))
.toEqual([ .toEqual([
@ -406,6 +429,11 @@ export function main() {
.toEqual([[HtmlTokenType.TEXT, 'a'], [HtmlTokenType.EOF]]); .toEqual([[HtmlTokenType.TEXT, 'a'], [HtmlTokenType.EOF]]);
}); });
// Plain text containing an LF, a lone CR and a CRLF pair is expected to yield
// a single TEXT token in which each of them appears as a single LF.
it('should handle CR & LF', () => {
expect(tokenizeAndHumanizeParts('t\ne\rs\r\nt'))
.toEqual([[HtmlTokenType.TEXT, 't\ne\ns\nt'], [HtmlTokenType.EOF]]);
});
it('should parse entities', () => { it('should parse entities', () => {
expect(tokenizeAndHumanizeParts('a&amp;b')) expect(tokenizeAndHumanizeParts('a&amp;b'))
.toEqual([[HtmlTokenType.TEXT, 'a&b'], [HtmlTokenType.EOF]]); .toEqual([[HtmlTokenType.TEXT, 'a&b'], [HtmlTokenType.EOF]]);
@ -424,11 +452,11 @@ export function main() {
describe('raw text', () => { describe('raw text', () => {
it('should parse text', () => { it('should parse text', () => {
expect(tokenizeAndHumanizeParts(`<script>a</script>`)) expect(tokenizeAndHumanizeParts(`<script>t\ne\rs\r\nt</script>`))
.toEqual([ .toEqual([
[HtmlTokenType.TAG_OPEN_START, null, 'script'], [HtmlTokenType.TAG_OPEN_START, null, 'script'],
[HtmlTokenType.TAG_OPEN_END], [HtmlTokenType.TAG_OPEN_END],
[HtmlTokenType.RAW_TEXT, 'a'], [HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
[HtmlTokenType.TAG_CLOSE, null, 'script'], [HtmlTokenType.TAG_CLOSE, null, 'script'],
[HtmlTokenType.EOF] [HtmlTokenType.EOF]
]); ]);
@ -482,11 +510,11 @@ export function main() {
describe('escapable raw text', () => { describe('escapable raw text', () => {
it('should parse text', () => { it('should parse text', () => {
expect(tokenizeAndHumanizeParts(`<title>a</title>`)) expect(tokenizeAndHumanizeParts(`<title>t\ne\rs\r\nt</title>`))
.toEqual([ .toEqual([
[HtmlTokenType.TAG_OPEN_START, null, 'title'], [HtmlTokenType.TAG_OPEN_START, null, 'title'],
[HtmlTokenType.TAG_OPEN_END], [HtmlTokenType.TAG_OPEN_END],
[HtmlTokenType.ESCAPABLE_RAW_TEXT, 'a'], [HtmlTokenType.ESCAPABLE_RAW_TEXT, 't\ne\ns\nt'],
[HtmlTokenType.TAG_CLOSE, null, 'title'], [HtmlTokenType.TAG_CLOSE, null, 'title'],
[HtmlTokenType.EOF] [HtmlTokenType.EOF]
]); ]);