fix(HtmlLexer): fix for unicode chars

fixes #6036
Closes #6061
This commit is contained in:
Victor Berchet 2015-12-21 11:32:58 -08:00
parent df3074fdfe
commit a24ee6add4
2 changed files with 99 additions and 61 deletions

View File

@ -73,10 +73,13 @@ const $LT = 60;
const $EQ = 61; const $EQ = 61;
const $GT = 62; const $GT = 62;
const $QUESTION = 63; const $QUESTION = 63;
const $A = 65;
const $Z = 90;
const $LBRACKET = 91; const $LBRACKET = 91;
const $RBRACKET = 93; const $RBRACKET = 93;
const $A = 65;
const $F = 70;
const $X = 88;
const $Z = 90;
const $a = 97; const $a = 97;
const $f = 102; const $f = 102;
const $z = 122; const $z = 122;
@ -102,7 +105,6 @@ class ControlFlowError {
// See http://www.w3.org/TR/html51/syntax.html#writing // See http://www.w3.org/TR/html51/syntax.html#writing
class _HtmlTokenizer { class _HtmlTokenizer {
private input: string; private input: string;
private inputLowercase: string;
private length: number; private length: number;
// Note: this is always lowercase! // Note: this is always lowercase!
private peek: number = -1; private peek: number = -1;
@ -117,7 +119,6 @@ class _HtmlTokenizer {
// Stores the source file's content and its length on the tokenizer, then calls
// _advance() once. In the pre-commit (left) column a lowercased copy of the
// content is also kept in `inputLowercase`; the post-commit (right) column drops it.
constructor(private file: ParseSourceFile) { constructor(private file: ParseSourceFile) {
this.input = file.content; this.input = file.content;
this.inputLowercase = file.content.toLowerCase();
this.length = file.content.length; this.length = file.content.length;
this._advance(); this._advance();
} }
@ -133,16 +134,16 @@ class _HtmlTokenizer {
while (this.peek !== $EOF) { while (this.peek !== $EOF) {
var start = this._getLocation(); var start = this._getLocation();
try { try {
if (this._attemptChar($LT)) { if (this._attemptCharCode($LT)) {
if (this._attemptChar($BANG)) { if (this._attemptCharCode($BANG)) {
if (this._attemptChar($LBRACKET)) { if (this._attemptCharCode($LBRACKET)) {
this._consumeCdata(start); this._consumeCdata(start);
} else if (this._attemptChar($MINUS)) { } else if (this._attemptCharCode($MINUS)) {
this._consumeComment(start); this._consumeComment(start);
} else { } else {
this._consumeDocType(start); this._consumeDocType(start);
} }
} else if (this._attemptChar($SLASH)) { } else if (this._attemptCharCode($SLASH)) {
this._consumeTagClose(start); this._consumeTagClose(start);
} else { } else {
this._consumeTagOpen(start); this._consumeTagOpen(start);
@ -205,11 +206,10 @@ class _HtmlTokenizer {
this.column++; this.column++;
} }
this.index++; this.index++;
this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.inputLowercase, this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.input, this.index);
this.index);
} }
private _attemptChar(charCode: number): boolean { private _attemptCharCode(charCode: number): boolean {
if (this.peek === charCode) { if (this.peek === charCode) {
this._advance(); this._advance();
return true; return true;
@ -217,38 +217,55 @@ class _HtmlTokenizer {
return false; return false;
} }
// Right column: _attemptCharCodeCaseInsensitive advances and returns true when
// `peek` matches `charCode` according to compareCharCodeCaseInsensitive; otherwise it
// returns false without advancing. (The left column's `_requireChar` header shares
// the first row; its body continues in the two-column rows further down.)
private _requireChar(charCode: number) { private _attemptCharCodeCaseInsensitive(charCode: number): boolean {
if (compareCharCodeCaseInsensitive(this.peek, charCode)) {
this._advance();
return true;
}
return false;
}
// _requireChar (left) / _requireCharCode (right): captures the current location, then
// tries to consume `charCode`; on failure throws the error built by _createError from
// the unexpected-character message for `peek` and the captured location.
private _requireCharCode(charCode: number) {
var location = this._getLocation(); var location = this._getLocation();
if (!this._attemptChar(charCode)) { if (!this._attemptCharCode(charCode)) {
throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
} }
} }
// _attemptChars (left) / _attemptStr (right): tries to consume every character of
// `chars` in order. Returns false at the first character that does not match and true
// once all characters matched (an empty string returns true).
// NOTE(review): nothing in this method rewinds on a partial match — characters
// matched before the mismatch appear to stay consumed; confirm callers expect that.
private _attemptChars(chars: string): boolean { private _attemptStr(chars: string): boolean {
for (var i = 0; i < chars.length; i++) { for (var i = 0; i < chars.length; i++) {
if (!this._attemptChar(StringWrapper.charCodeAt(chars, i))) { if (!this._attemptCharCode(StringWrapper.charCodeAt(chars, i))) {
return false; return false;
} }
} }
return true; return true;
} }
// Right column: _attemptStrCaseInsensitive consumes `chars` one character at a time
// through _attemptCharCodeCaseInsensitive, returning false at the first mismatch and
// true if every character matched. (The left column's `_requireChars` header shares
// the first row; its body continues in the two-column rows further down.)
private _requireChars(chars: string) { private _attemptStrCaseInsensitive(chars: string): boolean {
for (var i = 0; i < chars.length; i++) {
if (!this._attemptCharCodeCaseInsensitive(StringWrapper.charCodeAt(chars, i))) {
return false;
}
}
return true;
}
// _requireChars (left) / _requireStr (right): captures the current location, then
// tries to consume `chars`; on failure throws the error built by _createError from the
// unexpected-character message for `peek` and the location captured before the attempt.
private _requireStr(chars: string) {
var location = this._getLocation(); var location = this._getLocation();
if (!this._attemptChars(chars)) { if (!this._attemptStr(chars)) {
throw this._createError(unexpectedCharacterErrorMsg(this.peek), location); throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
} }
} }
// _attemptUntilFn (left) / _attemptCharCodeUntilFn (right): keeps advancing while
// `predicate(peek)` is falsy, i.e. stops on the first character the predicate accepts.
// NOTE(review): there is no explicit $EOF check in this loop — termination at end of
// input presumably depends on the predicate or on _advance; confirm.
private _attemptUntilFn(predicate: Function) { private _attemptCharCodeUntilFn(predicate: Function) {
while (!predicate(this.peek)) { while (!predicate(this.peek)) {
this._advance(); this._advance();
} }
} }
private _requireUntilFn(predicate: Function, len: number) { private _requireCharCodeUntilFn(predicate: Function, len: number) {
var start = this._getLocation(); var start = this._getLocation();
this._attemptUntilFn(predicate); this._attemptCharCodeUntilFn(predicate);
if (this.index - start.offset < len) { if (this.index - start.offset < len) {
throw this._createError(unexpectedCharacterErrorMsg(this.peek), start); throw this._createError(unexpectedCharacterErrorMsg(this.peek), start);
} }
@ -273,10 +290,10 @@ class _HtmlTokenizer {
private _decodeEntity(): string { private _decodeEntity(): string {
var start = this._getLocation(); var start = this._getLocation();
this._advance(); this._advance();
if (this._attemptChar($HASH)) { if (this._attemptCharCode($HASH)) {
let isHex = this._attemptChar($x); let isHex = this._attemptCharCode($x) || this._attemptCharCode($X);
let numberStart = this._getLocation().offset; let numberStart = this._getLocation().offset;
this._attemptUntilFn(isDigitEntityEnd); this._attemptCharCodeUntilFn(isDigitEntityEnd);
if (this.peek != $SEMICOLON) { if (this.peek != $SEMICOLON) {
throw this._createError(unexpectedCharacterErrorMsg(this.peek), this._getLocation()); throw this._createError(unexpectedCharacterErrorMsg(this.peek), this._getLocation());
} }
@ -291,7 +308,7 @@ class _HtmlTokenizer {
} }
} else { } else {
let startPosition = this._savePosition(); let startPosition = this._savePosition();
this._attemptUntilFn(isNamedEntityEnd); this._attemptCharCodeUntilFn(isNamedEntityEnd);
if (this.peek != $SEMICOLON) { if (this.peek != $SEMICOLON) {
this._restorePosition(startPosition); this._restorePosition(startPosition);
return '&'; return '&';
@ -315,7 +332,7 @@ class _HtmlTokenizer {
var parts = []; var parts = [];
while (true) { while (true) {
tagCloseStart = this._getLocation(); tagCloseStart = this._getLocation();
if (this._attemptChar(firstCharOfEnd) && attemptEndRest()) { if (this._attemptCharCode(firstCharOfEnd) && attemptEndRest()) {
break; break;
} }
if (this.index > tagCloseStart.offset) { if (this.index > tagCloseStart.offset) {
@ -330,18 +347,18 @@ class _HtmlTokenizer {
// Tokenizes a comment. Opens a COMMENT_START token at `start`, requires a `-`
// character and ends the token with no parts. The body is then read by _consumeRawText
// (first argument false, end marker `-` followed by '->'), and an empty COMMENT_END
// token is emitted beginning where the raw-text token's source span ends.
private _consumeComment(start: ParseLocation) { private _consumeComment(start: ParseLocation) {
this._beginToken(HtmlTokenType.COMMENT_START, start); this._beginToken(HtmlTokenType.COMMENT_START, start);
this._requireChar($MINUS); this._requireCharCode($MINUS);
this._endToken([]); this._endToken([]);
var textToken = this._consumeRawText(false, $MINUS, () => this._attemptChars('->')); var textToken = this._consumeRawText(false, $MINUS, () => this._attemptStr('->'));
this._beginToken(HtmlTokenType.COMMENT_END, textToken.sourceSpan.end); this._beginToken(HtmlTokenType.COMMENT_END, textToken.sourceSpan.end);
this._endToken([]); this._endToken([]);
} }
// Tokenizes a CDATA section. Opens a CDATA_START token at `start` and requires the
// section keyword: the left (pre-commit) column asks for lowercase 'cdata[', the right
// (post-commit) column asks for uppercase 'CDATA['. The body is then read by
// _consumeRawText (first argument false, end marker `]` followed by ']>'), and an empty
// CDATA_END token is emitted beginning where the raw-text token's source span ends.
private _consumeCdata(start: ParseLocation) { private _consumeCdata(start: ParseLocation) {
this._beginToken(HtmlTokenType.CDATA_START, start); this._beginToken(HtmlTokenType.CDATA_START, start);
this._requireChars('cdata['); this._requireStr('CDATA[');
this._endToken([]); this._endToken([]);
var textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptChars(']>')); var textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptStr(']>'));
this._beginToken(HtmlTokenType.CDATA_END, textToken.sourceSpan.end); this._beginToken(HtmlTokenType.CDATA_END, textToken.sourceSpan.end);
this._endToken([]); this._endToken([]);
} }
@ -367,7 +384,7 @@ class _HtmlTokenizer {
} else { } else {
nameStart = nameOrPrefixStart; nameStart = nameOrPrefixStart;
} }
this._requireUntilFn(isNameEnd, this.index === nameStart ? 1 : 0); this._requireCharCodeUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
var name = this.input.substring(nameStart, this.index); var name = this.input.substring(nameStart, this.index);
return [prefix, name]; return [prefix, name];
} }
@ -381,16 +398,16 @@ class _HtmlTokenizer {
} }
var nameStart = this.index; var nameStart = this.index;
this._consumeTagOpenStart(start); this._consumeTagOpenStart(start);
lowercaseTagName = this.inputLowercase.substring(nameStart, this.index); lowercaseTagName = this.input.substring(nameStart, this.index).toLowerCase();
this._attemptUntilFn(isNotWhitespace); this._attemptCharCodeUntilFn(isNotWhitespace);
while (this.peek !== $SLASH && this.peek !== $GT) { while (this.peek !== $SLASH && this.peek !== $GT) {
this._consumeAttributeName(); this._consumeAttributeName();
this._attemptUntilFn(isNotWhitespace); this._attemptCharCodeUntilFn(isNotWhitespace);
if (this._attemptChar($EQ)) { if (this._attemptCharCode($EQ)) {
this._attemptUntilFn(isNotWhitespace); this._attemptCharCodeUntilFn(isNotWhitespace);
this._consumeAttributeValue(); this._consumeAttributeValue();
} }
this._attemptUntilFn(isNotWhitespace); this._attemptCharCodeUntilFn(isNotWhitespace);
} }
this._consumeTagOpenEnd(); this._consumeTagOpenEnd();
} catch (e) { } catch (e) {
@ -416,11 +433,11 @@ class _HtmlTokenizer {
private _consumeRawTextWithTagClose(lowercaseTagName: string, decodeEntities: boolean) { private _consumeRawTextWithTagClose(lowercaseTagName: string, decodeEntities: boolean) {
var textToken = this._consumeRawText(decodeEntities, $LT, () => { var textToken = this._consumeRawText(decodeEntities, $LT, () => {
if (!this._attemptChar($SLASH)) return false; if (!this._attemptCharCode($SLASH)) return false;
this._attemptUntilFn(isNotWhitespace); this._attemptCharCodeUntilFn(isNotWhitespace);
if (!this._attemptChars(lowercaseTagName)) return false; if (!this._attemptStrCaseInsensitive(lowercaseTagName)) return false;
this._attemptUntilFn(isNotWhitespace); this._attemptCharCodeUntilFn(isNotWhitespace);
if (!this._attemptChar($GT)) return false; if (!this._attemptCharCode($GT)) return false;
return true; return true;
}); });
this._beginToken(HtmlTokenType.TAG_CLOSE, textToken.sourceSpan.end); this._beginToken(HtmlTokenType.TAG_CLOSE, textToken.sourceSpan.end);
@ -453,27 +470,27 @@ class _HtmlTokenizer {
this._advance(); this._advance();
} else { } else {
var valueStart = this.index; var valueStart = this.index;
this._requireUntilFn(isNameEnd, 1); this._requireCharCodeUntilFn(isNameEnd, 1);
value = this.input.substring(valueStart, this.index); value = this.input.substring(valueStart, this.index);
} }
this._endToken([this._processCarriageReturns(value)]); this._endToken([this._processCarriageReturns(value)]);
} }
// Tokenizes the end of an opening tag. If a `/` can be consumed the token type is
// TAG_OPEN_END_VOID, otherwise TAG_OPEN_END. The `/` is consumed before _beginToken is
// called (no explicit start location is passed). A `>` is then required and the token
// is ended with no parts.
private _consumeTagOpenEnd() { private _consumeTagOpenEnd() {
var tokenType = var tokenType = this._attemptCharCode($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID :
this._attemptChar($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID : HtmlTokenType.TAG_OPEN_END; HtmlTokenType.TAG_OPEN_END;
this._beginToken(tokenType); this._beginToken(tokenType);
this._requireChar($GT); this._requireCharCode($GT);
this._endToken([]); this._endToken([]);
} }
// Tokenizes a closing tag. Opens a TAG_CLOSE token at `start`, advances until
// isNotWhitespace accepts the current character, reads the tag's prefix and name via
// _consumePrefixAndName, advances the same way again, requires `>`, and ends the token
// with the value returned by _consumePrefixAndName as its parts.
private _consumeTagClose(start: ParseLocation) { private _consumeTagClose(start: ParseLocation) {
this._beginToken(HtmlTokenType.TAG_CLOSE, start); this._beginToken(HtmlTokenType.TAG_CLOSE, start);
this._attemptUntilFn(isNotWhitespace); this._attemptCharCodeUntilFn(isNotWhitespace);
var prefixAndName; var prefixAndName;
prefixAndName = this._consumePrefixAndName(); prefixAndName = this._consumePrefixAndName();
this._attemptUntilFn(isNotWhitespace); this._attemptCharCodeUntilFn(isNotWhitespace);
this._requireChar($GT); this._requireCharCode($GT);
this._endToken(prefixAndName); this._endToken(prefixAndName);
} }
@ -534,11 +551,19 @@ function isTextEnd(code: number): boolean {
} }
// Returns whether `code` is an ASCII letter. The left (pre-commit) column accepts only
// the range $a..$z; the right (post-commit) column also accepts $A..$Z.
function isAsciiLetter(code: number): boolean { function isAsciiLetter(code: number): boolean {
return code >= $a && code <= $z; return code >= $a && code <= $z || code >= $A && code <= $Z;
} }
// Returns whether `code` is an ASCII hexadecimal digit. The left (pre-commit) column
// accepts $a..$f and $0..$9; the right (post-commit) column additionally accepts $A..$F.
function isAsciiHexDigit(code: number): boolean { function isAsciiHexDigit(code: number): boolean {
return code >= $a && code <= $f || code >= $0 && code <= $9; return code >= $a && code <= $f || code >= $A && code <= $F || code >= $0 && code <= $9;
}
// Compares two character codes after passing each through toUpperCaseCharCode, so
// codes that differ only by that case folding are reported as equal.
function compareCharCodeCaseInsensitive(code1: number, code2: number): boolean {
  var folded1 = toUpperCaseCharCode(code1);
  var folded2 = toUpperCaseCharCode(code2);
  return folded1 == folded2;
}
// Maps a character code in the range $a..$z to the code at the same offset from $A;
// any other code is returned unchanged.
function toUpperCaseCharCode(code: number): number {
return code >= $a && code <= $z ? code - $a + $A : code;
} }
function mergeTextTokens(srcTokens: HtmlToken[]): HtmlToken[] { function mergeTextTokens(srcTokens: HtmlToken[]): HtmlToken[] {

View File

@ -114,9 +114,9 @@ export function main() {
}); });
}); });
describe('cdata', () => { describe('CDATA', () => {
it('should parse cdata', () => { it('should parse CDATA', () => {
expect(tokenizeAndHumanizeParts('<![cdata[t\ne\rs\r\nt]]>')) expect(tokenizeAndHumanizeParts('<![CDATA[t\ne\rs\r\nt]]>'))
.toEqual([ .toEqual([
[HtmlTokenType.CDATA_START], [HtmlTokenType.CDATA_START],
[HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'], [HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
@ -126,22 +126,22 @@ export function main() {
}); });
it('should store the locations', () => { it('should store the locations', () => {
expect(tokenizeAndHumanizeSourceSpans('<![cdata[t\ne\rs\r\nt]]>')) expect(tokenizeAndHumanizeSourceSpans('<![CDATA[t\ne\rs\r\nt]]>'))
.toEqual([ .toEqual([
[HtmlTokenType.CDATA_START, '<![cdata['], [HtmlTokenType.CDATA_START, '<![CDATA['],
[HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'], [HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
[HtmlTokenType.CDATA_END, ']]>'], [HtmlTokenType.CDATA_END, ']]>'],
[HtmlTokenType.EOF, ''] [HtmlTokenType.EOF, '']
]); ]);
}); });
it('should report <![ without cdata[', () => { it('should report <![ without CDATA[', () => {
expect(tokenizeAndHumanizeErrors('<![a')) expect(tokenizeAndHumanizeErrors('<![a'))
.toEqual([[HtmlTokenType.CDATA_START, 'Unexpected character "a"', '0:3']]); .toEqual([[HtmlTokenType.CDATA_START, 'Unexpected character "a"', '0:3']]);
}); });
it('should report missing end cdata', () => { it('should report missing end cdata', () => {
expect(tokenizeAndHumanizeErrors('<![cdata[')) expect(tokenizeAndHumanizeErrors('<![CDATA['))
.toEqual([[HtmlTokenType.RAW_TEXT, 'Unexpected character "EOF"', '0:9']]); .toEqual([[HtmlTokenType.RAW_TEXT, 'Unexpected character "EOF"', '0:9']]);
}); });
}); });
@ -367,8 +367,8 @@ export function main() {
}); });
it('should parse hexadecimal entities', () => { it('should parse hexadecimal entities', () => {
expect(tokenizeAndHumanizeParts('&#x41;')) expect(tokenizeAndHumanizeParts('&#x41;&#X41;'))
.toEqual([[HtmlTokenType.TEXT, 'A'], [HtmlTokenType.EOF]]); .toEqual([[HtmlTokenType.TEXT, 'AA'], [HtmlTokenType.EOF]]);
}); });
it('should parse decimal entities', () => { it('should parse decimal entities', () => {
@ -473,7 +473,7 @@ export function main() {
}); });
it('should not detect entities', () => { it('should not detect entities', () => {
expect(tokenizeAndHumanizeParts(`<script>&amp;</script>`)) expect(tokenizeAndHumanizeParts(`<script>&amp;</SCRIPT>`))
.toEqual([ .toEqual([
[HtmlTokenType.TAG_OPEN_START, null, 'script'], [HtmlTokenType.TAG_OPEN_START, null, 'script'],
[HtmlTokenType.TAG_OPEN_END], [HtmlTokenType.TAG_OPEN_END],
@ -587,6 +587,19 @@ export function main() {
}); });
}); });
// Checks that source spans stay aligned with the original input when the text contains
// a non-ASCII character.
// NOTE(review): 'İ' was presumably chosen because lowercasing it changes the string's
// length, which would shift offsets computed against a lowercased copy — confirm.
describe('unicode characters', () => {
  it('should support unicode characters', () => {
    var expectedSpans = [
      [HtmlTokenType.TAG_OPEN_START, '<p'],
      [HtmlTokenType.TAG_OPEN_END, '>'],
      [HtmlTokenType.TEXT, 'İ'],
      [HtmlTokenType.TAG_CLOSE, '</p>'],
      [HtmlTokenType.EOF, '']
    ];
    var actualSpans = tokenizeAndHumanizeSourceSpans(`<p>İ</p>`);
    expect(actualSpans).toEqual(expectedSpans);
  });
});
}); });
} }