fix(HtmlLexer): fix for unicode chars

fixes #6036
Closes #6061
This commit is contained in:
Victor Berchet 2015-12-21 11:32:58 -08:00
parent df3074fdfe
commit a24ee6add4
2 changed files with 99 additions and 61 deletions

View File

@ -73,10 +73,13 @@ const $LT = 60;
const $EQ = 61;
const $GT = 62;
const $QUESTION = 63;
const $A = 65;
const $Z = 90;
const $LBRACKET = 91;
const $RBRACKET = 93;
const $A = 65;
const $F = 70;
const $X = 88;
const $Z = 90;
const $a = 97;
const $f = 102;
const $z = 122;
@ -102,7 +105,6 @@ class ControlFlowError {
// See http://www.w3.org/TR/html51/syntax.html#writing
class _HtmlTokenizer {
private input: string;
private inputLowercase: string;
private length: number;
// Note: this is the char code read from `input` as-is; it is NOT lowercased.
private peek: number = -1;
@ -117,7 +119,6 @@ class _HtmlTokenizer {
constructor(private file: ParseSourceFile) {
this.input = file.content;
this.inputLowercase = file.content.toLowerCase();
this.length = file.content.length;
this._advance();
}
@ -133,16 +134,16 @@ class _HtmlTokenizer {
while (this.peek !== $EOF) {
var start = this._getLocation();
try {
if (this._attemptChar($LT)) {
if (this._attemptChar($BANG)) {
if (this._attemptChar($LBRACKET)) {
if (this._attemptCharCode($LT)) {
if (this._attemptCharCode($BANG)) {
if (this._attemptCharCode($LBRACKET)) {
this._consumeCdata(start);
} else if (this._attemptChar($MINUS)) {
} else if (this._attemptCharCode($MINUS)) {
this._consumeComment(start);
} else {
this._consumeDocType(start);
}
} else if (this._attemptChar($SLASH)) {
} else if (this._attemptCharCode($SLASH)) {
this._consumeTagClose(start);
} else {
this._consumeTagOpen(start);
@ -205,11 +206,10 @@ class _HtmlTokenizer {
this.column++;
}
this.index++;
this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.inputLowercase,
this.index);
this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.input, this.index);
}
private _attemptChar(charCode: number): boolean {
private _attemptCharCode(charCode: number): boolean {
if (this.peek === charCode) {
this._advance();
return true;
@ -217,38 +217,55 @@ class _HtmlTokenizer {
return false;
}
private _requireChar(charCode: number) {
/**
 * Consumes the current character when it matches `charCode` according to
 * `compareCharCodeCaseInsensitive`.
 *
 * @param charCode char code to compare against the current `peek` value
 * @returns `true` when the character matched and the tokenizer advanced, `false` otherwise
 */
private _attemptCharCodeCaseInsensitive(charCode: number): boolean {
  const matches = compareCharCodeCaseInsensitive(this.peek, charCode);
  if (!matches) {
    return false;
  }
  this._advance();
  return true;
}
private _requireCharCode(charCode: number) {
var location = this._getLocation();
if (!this._attemptChar(charCode)) {
if (!this._attemptCharCode(charCode)) {
throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
}
}
private _attemptChars(chars: string): boolean {
private _attemptStr(chars: string): boolean {
for (var i = 0; i < chars.length; i++) {
if (!this._attemptChar(StringWrapper.charCodeAt(chars, i))) {
if (!this._attemptCharCode(StringWrapper.charCodeAt(chars, i))) {
return false;
}
}
return true;
}
private _requireChars(chars: string) {
/**
 * Tries to consume `chars` one char code at a time, comparing case-insensitively.
 *
 * Characters matched before the first mismatch stay consumed; this method does not
 * rewind the position on failure.
 *
 * @param chars the string to match at the current position
 * @returns `true` when every character of `chars` was matched and consumed
 */
private _attemptStrCaseInsensitive(chars: string): boolean {
  let pos = 0;
  while (pos < chars.length) {
    const expected = StringWrapper.charCodeAt(chars, pos);
    if (!this._attemptCharCodeCaseInsensitive(expected)) {
      return false;
    }
    pos++;
  }
  return true;
}
private _requireStr(chars: string) {
var location = this._getLocation();
if (!this._attemptChars(chars)) {
if (!this._attemptStr(chars)) {
throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
}
}
private _attemptUntilFn(predicate: Function) {
private _attemptCharCodeUntilFn(predicate: Function) {
while (!predicate(this.peek)) {
this._advance();
}
}
private _requireUntilFn(predicate: Function, len: number) {
private _requireCharCodeUntilFn(predicate: Function, len: number) {
var start = this._getLocation();
this._attemptUntilFn(predicate);
this._attemptCharCodeUntilFn(predicate);
if (this.index - start.offset < len) {
throw this._createError(unexpectedCharacterErrorMsg(this.peek), start);
}
@ -273,10 +290,10 @@ class _HtmlTokenizer {
private _decodeEntity(): string {
var start = this._getLocation();
this._advance();
if (this._attemptChar($HASH)) {
let isHex = this._attemptChar($x);
if (this._attemptCharCode($HASH)) {
let isHex = this._attemptCharCode($x) || this._attemptCharCode($X);
let numberStart = this._getLocation().offset;
this._attemptUntilFn(isDigitEntityEnd);
this._attemptCharCodeUntilFn(isDigitEntityEnd);
if (this.peek != $SEMICOLON) {
throw this._createError(unexpectedCharacterErrorMsg(this.peek), this._getLocation());
}
@ -291,7 +308,7 @@ class _HtmlTokenizer {
}
} else {
let startPosition = this._savePosition();
this._attemptUntilFn(isNamedEntityEnd);
this._attemptCharCodeUntilFn(isNamedEntityEnd);
if (this.peek != $SEMICOLON) {
this._restorePosition(startPosition);
return '&';
@ -315,7 +332,7 @@ class _HtmlTokenizer {
var parts = [];
while (true) {
tagCloseStart = this._getLocation();
if (this._attemptChar(firstCharOfEnd) && attemptEndRest()) {
if (this._attemptCharCode(firstCharOfEnd) && attemptEndRest()) {
break;
}
if (this.index > tagCloseStart.offset) {
@ -330,18 +347,18 @@ class _HtmlTokenizer {
private _consumeComment(start: ParseLocation) {
this._beginToken(HtmlTokenType.COMMENT_START, start);
this._requireChar($MINUS);
this._requireCharCode($MINUS);
this._endToken([]);
var textToken = this._consumeRawText(false, $MINUS, () => this._attemptChars('->'));
var textToken = this._consumeRawText(false, $MINUS, () => this._attemptStr('->'));
this._beginToken(HtmlTokenType.COMMENT_END, textToken.sourceSpan.end);
this._endToken([]);
}
private _consumeCdata(start: ParseLocation) {
this._beginToken(HtmlTokenType.CDATA_START, start);
this._requireChars('cdata[');
this._requireStr('CDATA[');
this._endToken([]);
var textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptChars(']>'));
var textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptStr(']>'));
this._beginToken(HtmlTokenType.CDATA_END, textToken.sourceSpan.end);
this._endToken([]);
}
@ -367,7 +384,7 @@ class _HtmlTokenizer {
} else {
nameStart = nameOrPrefixStart;
}
this._requireUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
this._requireCharCodeUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
var name = this.input.substring(nameStart, this.index);
return [prefix, name];
}
@ -381,16 +398,16 @@ class _HtmlTokenizer {
}
var nameStart = this.index;
this._consumeTagOpenStart(start);
lowercaseTagName = this.inputLowercase.substring(nameStart, this.index);
this._attemptUntilFn(isNotWhitespace);
lowercaseTagName = this.input.substring(nameStart, this.index).toLowerCase();
this._attemptCharCodeUntilFn(isNotWhitespace);
while (this.peek !== $SLASH && this.peek !== $GT) {
this._consumeAttributeName();
this._attemptUntilFn(isNotWhitespace);
if (this._attemptChar($EQ)) {
this._attemptUntilFn(isNotWhitespace);
this._attemptCharCodeUntilFn(isNotWhitespace);
if (this._attemptCharCode($EQ)) {
this._attemptCharCodeUntilFn(isNotWhitespace);
this._consumeAttributeValue();
}
this._attemptUntilFn(isNotWhitespace);
this._attemptCharCodeUntilFn(isNotWhitespace);
}
this._consumeTagOpenEnd();
} catch (e) {
@ -416,11 +433,11 @@ class _HtmlTokenizer {
private _consumeRawTextWithTagClose(lowercaseTagName: string, decodeEntities: boolean) {
var textToken = this._consumeRawText(decodeEntities, $LT, () => {
if (!this._attemptChar($SLASH)) return false;
this._attemptUntilFn(isNotWhitespace);
if (!this._attemptChars(lowercaseTagName)) return false;
this._attemptUntilFn(isNotWhitespace);
if (!this._attemptChar($GT)) return false;
if (!this._attemptCharCode($SLASH)) return false;
this._attemptCharCodeUntilFn(isNotWhitespace);
if (!this._attemptStrCaseInsensitive(lowercaseTagName)) return false;
this._attemptCharCodeUntilFn(isNotWhitespace);
if (!this._attemptCharCode($GT)) return false;
return true;
});
this._beginToken(HtmlTokenType.TAG_CLOSE, textToken.sourceSpan.end);
@ -453,27 +470,27 @@ class _HtmlTokenizer {
this._advance();
} else {
var valueStart = this.index;
this._requireUntilFn(isNameEnd, 1);
this._requireCharCodeUntilFn(isNameEnd, 1);
value = this.input.substring(valueStart, this.index);
}
this._endToken([this._processCarriageReturns(value)]);
}
private _consumeTagOpenEnd() {
var tokenType =
this._attemptChar($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID : HtmlTokenType.TAG_OPEN_END;
var tokenType = this._attemptCharCode($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID :
HtmlTokenType.TAG_OPEN_END;
this._beginToken(tokenType);
this._requireChar($GT);
this._requireCharCode($GT);
this._endToken([]);
}
private _consumeTagClose(start: ParseLocation) {
this._beginToken(HtmlTokenType.TAG_CLOSE, start);
this._attemptUntilFn(isNotWhitespace);
this._attemptCharCodeUntilFn(isNotWhitespace);
var prefixAndName;
prefixAndName = this._consumePrefixAndName();
this._attemptUntilFn(isNotWhitespace);
this._requireChar($GT);
this._attemptCharCodeUntilFn(isNotWhitespace);
this._requireCharCode($GT);
this._endToken(prefixAndName);
}
@ -534,11 +551,19 @@ function isTextEnd(code: number): boolean {
}
function isAsciiLetter(code: number): boolean {
return code >= $a && code <= $z;
return code >= $a && code <= $z || code >= $A && code <= $Z;
}
function isAsciiHexDigit(code: number): boolean {
return code >= $a && code <= $f || code >= $0 && code <= $9;
return code >= $a && code <= $f || code >= $A && code <= $F || code >= $0 && code <= $9;
}
/**
 * Reports whether two char codes are equal after each has been passed through
 * `toUpperCaseCharCode`.
 *
 * @param code1 first char code
 * @param code2 second char code
 * @returns `true` when the folded codes are equal
 */
function compareCharCodeCaseInsensitive(code1: number, code2: number): boolean {
  const folded1 = toUpperCaseCharCode(code1);
  const folded2 = toUpperCaseCharCode(code2);
  return folded1 == folded2;
}
/**
 * Maps a char code in the `$a`..`$z` range to the code at the same offset from `$A`;
 * every other code is returned unchanged.
 *
 * @param code char code to fold
 * @returns the folded char code
 */
function toUpperCaseCharCode(code: number): number {
  const isLowercaseLetter = code >= $a && code <= $z;
  if (isLowercaseLetter) {
    return code - $a + $A;
  }
  return code;
}
function mergeTextTokens(srcTokens: HtmlToken[]): HtmlToken[] {

View File

@ -114,9 +114,9 @@ export function main() {
});
});
describe('cdata', () => {
it('should parse cdata', () => {
expect(tokenizeAndHumanizeParts('<![cdata[t\ne\rs\r\nt]]>'))
describe('CDATA', () => {
it('should parse CDATA', () => {
expect(tokenizeAndHumanizeParts('<![CDATA[t\ne\rs\r\nt]]>'))
.toEqual([
[HtmlTokenType.CDATA_START],
[HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
@ -126,22 +126,22 @@ export function main() {
});
it('should store the locations', () => {
expect(tokenizeAndHumanizeSourceSpans('<![cdata[t\ne\rs\r\nt]]>'))
expect(tokenizeAndHumanizeSourceSpans('<![CDATA[t\ne\rs\r\nt]]>'))
.toEqual([
[HtmlTokenType.CDATA_START, '<![cdata['],
[HtmlTokenType.CDATA_START, '<![CDATA['],
[HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
[HtmlTokenType.CDATA_END, ']]>'],
[HtmlTokenType.EOF, '']
]);
});
it('should report <![ without cdata[', () => {
it('should report <![ without CDATA[', () => {
expect(tokenizeAndHumanizeErrors('<![a'))
.toEqual([[HtmlTokenType.CDATA_START, 'Unexpected character "a"', '0:3']]);
});
it('should report missing end cdata', () => {
expect(tokenizeAndHumanizeErrors('<![cdata['))
expect(tokenizeAndHumanizeErrors('<![CDATA['))
.toEqual([[HtmlTokenType.RAW_TEXT, 'Unexpected character "EOF"', '0:9']]);
});
});
@ -367,8 +367,8 @@ export function main() {
});
it('should parse hexadecimal entities', () => {
expect(tokenizeAndHumanizeParts('&#x41;'))
.toEqual([[HtmlTokenType.TEXT, 'A'], [HtmlTokenType.EOF]]);
expect(tokenizeAndHumanizeParts('&#x41;&#X41;'))
.toEqual([[HtmlTokenType.TEXT, 'AA'], [HtmlTokenType.EOF]]);
});
it('should parse decimal entities', () => {
@ -473,7 +473,7 @@ export function main() {
});
it('should not detect entities', () => {
expect(tokenizeAndHumanizeParts(`<script>&amp;</script>`))
expect(tokenizeAndHumanizeParts(`<script>&amp;</SCRIPT>`))
.toEqual([
[HtmlTokenType.TAG_OPEN_START, null, 'script'],
[HtmlTokenType.TAG_OPEN_END],
@ -587,6 +587,19 @@ export function main() {
});
});
describe('unicode characters', () => {
  // 'İ' (U+0130) is used because its JavaScript lowercase form is two UTF-16 code units
  // long, so the reported source spans are only correct when offsets are computed on the
  // original, non-lowercased input.
  it('should support unicode characters', () => {
    const spans = tokenizeAndHumanizeSourceSpans(`<p>İ</p>`);
    const expectedSpans = [
      [HtmlTokenType.TAG_OPEN_START, '<p'],
      [HtmlTokenType.TAG_OPEN_END, '>'],
      [HtmlTokenType.TEXT, 'İ'],
      [HtmlTokenType.TAG_CLOSE, '</p>'],
      [HtmlTokenType.EOF, '']
    ];
    expect(spans).toEqual(expectedSpans);
  });
});
});
}