parent
df3074fdfe
commit
a24ee6add4
|
@ -73,10 +73,13 @@ const $LT = 60;
|
|||
// ASCII character codes used by the tokenizer.
// Each name is declared exactly once: `$A` and `$Z` previously appeared twice in this run,
// which is a `const` redeclaration error.
const $EQ = 61;        // '='
const $GT = 62;        // '>'
const $QUESTION = 63;  // '?'
const $LBRACKET = 91;  // '['
const $RBRACKET = 93;  // ']'
const $A = 65;         // 'A'
const $F = 70;         // 'F' (upper bound of the upper-case hex digits)
const $X = 88;         // 'X' (upper-case hex prefix, as in `&#X41;`)
const $Z = 90;         // 'Z'

const $a = 97;         // 'a'
const $f = 102;        // 'f' (upper bound of the lower-case hex digits)
const $z = 122;        // 'z'
|
||||
|
@ -102,7 +105,6 @@ class ControlFlowError {
|
|||
// See http://www.w3.org/TR/html51/syntax.html#writing
|
||||
class _HtmlTokenizer {
|
||||
private input: string;
|
||||
private inputLowercase: string;
|
||||
private length: number;
|
||||
// Note: this is always lowercase!
|
||||
private peek: number = -1;
|
||||
|
@ -117,7 +119,6 @@ class _HtmlTokenizer {
|
|||
|
||||
// Captures the source text and its length, then calls `_advance()` so that `peek` is
// populated before tokenizing starts.
constructor(private file: ParseSourceFile) {
  this.input = file.content;
  // NOTE(review): `_advance` takes `peek` from `this.input`; confirm whether anything still
  // needs this lower-cased copy before dropping it. `toLowerCase()` can change the string
  // length (e.g. 'İ'), so offsets into it need not line up with offsets into `input`.
  this.inputLowercase = file.content.toLowerCase();
  this.length = file.content.length;
  this._advance();
}
|
||||
|
@ -133,16 +134,16 @@ class _HtmlTokenizer {
|
|||
while (this.peek !== $EOF) {
|
||||
var start = this._getLocation();
|
||||
try {
|
||||
if (this._attemptChar($LT)) {
|
||||
if (this._attemptChar($BANG)) {
|
||||
if (this._attemptChar($LBRACKET)) {
|
||||
if (this._attemptCharCode($LT)) {
|
||||
if (this._attemptCharCode($BANG)) {
|
||||
if (this._attemptCharCode($LBRACKET)) {
|
||||
this._consumeCdata(start);
|
||||
} else if (this._attemptChar($MINUS)) {
|
||||
} else if (this._attemptCharCode($MINUS)) {
|
||||
this._consumeComment(start);
|
||||
} else {
|
||||
this._consumeDocType(start);
|
||||
}
|
||||
} else if (this._attemptChar($SLASH)) {
|
||||
} else if (this._attemptCharCode($SLASH)) {
|
||||
this._consumeTagClose(start);
|
||||
} else {
|
||||
this._consumeTagOpen(start);
|
||||
|
@ -205,11 +206,10 @@ class _HtmlTokenizer {
|
|||
this.column++;
|
||||
}
|
||||
this.index++;
|
||||
this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.inputLowercase,
|
||||
this.index);
|
||||
this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.input, this.index);
|
||||
}
|
||||
|
||||
// Consumes the current character if its code is exactly `charCode` (case-sensitive).
// Returns true and advances on a match; returns false and leaves the position untouched
// otherwise.
private _attemptCharCode(charCode: number): boolean {
  if (this.peek === charCode) {
    this._advance();
    return true;
  }
  return false;
}
|
||||
|
||||
private _requireChar(charCode: number) {
|
||||
// Consumes the current character if it equals `charCode` according to
// `compareCharCodeCaseInsensitive`.
// Returns true and advances on a match; returns false without advancing otherwise.
private _attemptCharCodeCaseInsensitive(charCode: number): boolean {
  if (compareCharCodeCaseInsensitive(this.peek, charCode)) {
    this._advance();
    return true;
  }
  return false;
}
|
||||
|
||||
// Consumes the current character, which must be exactly `charCode`.
// Throws the error built by `_createError` (an "unexpected character" message for `peek`,
// located at the position captured before the attempt) when it is not.
private _requireCharCode(charCode: number) {
  const location = this._getLocation();
  if (!this._attemptCharCode(charCode)) {
    throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
  }
}
|
||||
|
||||
// Tries to consume `chars` one character at a time, case-sensitively.
// Returns true when every character matched. On the first mismatch it returns false
// immediately; characters matched before the mismatch stay consumed (nothing here restores
// the position).
private _attemptStr(chars: string): boolean {
  for (let i = 0; i < chars.length; i++) {
    if (!this._attemptCharCode(StringWrapper.charCodeAt(chars, i))) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
private _requireChars(chars: string) {
|
||||
// Tries to consume `chars` one character at a time, comparing each character through
// `_attemptCharCodeCaseInsensitive`.
// Returns true when every character matched. On the first mismatch it returns false
// immediately; characters matched before the mismatch stay consumed.
private _attemptStrCaseInsensitive(chars: string): boolean {
  for (let i = 0; i < chars.length; i++) {
    if (!this._attemptCharCodeCaseInsensitive(StringWrapper.charCodeAt(chars, i))) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
// Consumes `chars`, which must match the input exactly (case-sensitive).
// Throws the error built by `_createError` when the match fails: the message names the
// character at `peek` where matching stopped, the location is where the attempt started.
private _requireStr(chars: string) {
  const location = this._getLocation();
  if (!this._attemptStr(chars)) {
    throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
  }
}
|
||||
|
||||
// Advances until `predicate(this.peek)` returns a truthy value; the character that
// satisfies the predicate is left unconsumed.
// NOTE(review): the loop ends only when the predicate accepts `peek`, so every predicate
// passed here presumably also accepts `$EOF` -- confirm against the predicate definitions.
private _attemptCharCodeUntilFn(predicate: Function) {
  while (!predicate(this.peek)) {
    this._advance();
  }
}
|
||||
|
||||
private _requireUntilFn(predicate: Function, len: number) {
|
||||
private _requireCharCodeUntilFn(predicate: Function, len: number) {
|
||||
var start = this._getLocation();
|
||||
this._attemptUntilFn(predicate);
|
||||
this._attemptCharCodeUntilFn(predicate);
|
||||
if (this.index - start.offset < len) {
|
||||
throw this._createError(unexpectedCharacterErrorMsg(this.peek), start);
|
||||
}
|
||||
|
@ -273,10 +290,10 @@ class _HtmlTokenizer {
|
|||
private _decodeEntity(): string {
|
||||
var start = this._getLocation();
|
||||
this._advance();
|
||||
if (this._attemptChar($HASH)) {
|
||||
let isHex = this._attemptChar($x);
|
||||
if (this._attemptCharCode($HASH)) {
|
||||
let isHex = this._attemptCharCode($x) || this._attemptCharCode($X);
|
||||
let numberStart = this._getLocation().offset;
|
||||
this._attemptUntilFn(isDigitEntityEnd);
|
||||
this._attemptCharCodeUntilFn(isDigitEntityEnd);
|
||||
if (this.peek != $SEMICOLON) {
|
||||
throw this._createError(unexpectedCharacterErrorMsg(this.peek), this._getLocation());
|
||||
}
|
||||
|
@ -291,7 +308,7 @@ class _HtmlTokenizer {
|
|||
}
|
||||
} else {
|
||||
let startPosition = this._savePosition();
|
||||
this._attemptUntilFn(isNamedEntityEnd);
|
||||
this._attemptCharCodeUntilFn(isNamedEntityEnd);
|
||||
if (this.peek != $SEMICOLON) {
|
||||
this._restorePosition(startPosition);
|
||||
return '&';
|
||||
|
@ -315,7 +332,7 @@ class _HtmlTokenizer {
|
|||
var parts = [];
|
||||
while (true) {
|
||||
tagCloseStart = this._getLocation();
|
||||
if (this._attemptChar(firstCharOfEnd) && attemptEndRest()) {
|
||||
if (this._attemptCharCode(firstCharOfEnd) && attemptEndRest()) {
|
||||
break;
|
||||
}
|
||||
if (this.index > tagCloseStart.offset) {
|
||||
|
@ -330,18 +347,18 @@ class _HtmlTokenizer {
|
|||
|
||||
// Emits COMMENT_START, the comment body, and COMMENT_END.
// The visible call site has already consumed `<`, `!` and one `-`, so a second `-` is
// required here before the start token is closed. The body is read by `_consumeRawText`
// with entity decoding off, ending at a `-` followed by `->`.
private _consumeComment(start: ParseLocation) {
  this._beginToken(HtmlTokenType.COMMENT_START, start);
  this._requireCharCode($MINUS);
  this._endToken([]);
  const textToken = this._consumeRawText(false, $MINUS, () => this._attemptStr('->'));
  // The end token begins where the raw text span ends.
  this._beginToken(HtmlTokenType.COMMENT_END, textToken.sourceSpan.end);
  this._endToken([]);
}
|
||||
|
||||
// Emits CDATA_START, the section body, and CDATA_END.
// The visible call site has already consumed `<![`; the keyword `CDATA[` must follow and is
// matched case-sensitively (upper-case only). The body is read by `_consumeRawText` with
// entity decoding off, ending at a `]` followed by `]>`.
private _consumeCdata(start: ParseLocation) {
  this._beginToken(HtmlTokenType.CDATA_START, start);
  this._requireStr('CDATA[');
  this._endToken([]);
  const textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptStr(']>'));
  // The end token begins where the raw text span ends.
  this._beginToken(HtmlTokenType.CDATA_END, textToken.sourceSpan.end);
  this._endToken([]);
}
|
||||
|
@ -367,7 +384,7 @@ class _HtmlTokenizer {
|
|||
} else {
|
||||
nameStart = nameOrPrefixStart;
|
||||
}
|
||||
this._requireUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
|
||||
this._requireCharCodeUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
|
||||
var name = this.input.substring(nameStart, this.index);
|
||||
return [prefix, name];
|
||||
}
|
||||
|
@ -381,16 +398,16 @@ class _HtmlTokenizer {
|
|||
}
|
||||
var nameStart = this.index;
|
||||
this._consumeTagOpenStart(start);
|
||||
lowercaseTagName = this.inputLowercase.substring(nameStart, this.index);
|
||||
this._attemptUntilFn(isNotWhitespace);
|
||||
lowercaseTagName = this.input.substring(nameStart, this.index).toLowerCase();
|
||||
this._attemptCharCodeUntilFn(isNotWhitespace);
|
||||
while (this.peek !== $SLASH && this.peek !== $GT) {
|
||||
this._consumeAttributeName();
|
||||
this._attemptUntilFn(isNotWhitespace);
|
||||
if (this._attemptChar($EQ)) {
|
||||
this._attemptUntilFn(isNotWhitespace);
|
||||
this._attemptCharCodeUntilFn(isNotWhitespace);
|
||||
if (this._attemptCharCode($EQ)) {
|
||||
this._attemptCharCodeUntilFn(isNotWhitespace);
|
||||
this._consumeAttributeValue();
|
||||
}
|
||||
this._attemptUntilFn(isNotWhitespace);
|
||||
this._attemptCharCodeUntilFn(isNotWhitespace);
|
||||
}
|
||||
this._consumeTagOpenEnd();
|
||||
} catch (e) {
|
||||
|
@ -416,11 +433,11 @@ class _HtmlTokenizer {
|
|||
|
||||
private _consumeRawTextWithTagClose(lowercaseTagName: string, decodeEntities: boolean) {
|
||||
var textToken = this._consumeRawText(decodeEntities, $LT, () => {
|
||||
if (!this._attemptChar($SLASH)) return false;
|
||||
this._attemptUntilFn(isNotWhitespace);
|
||||
if (!this._attemptChars(lowercaseTagName)) return false;
|
||||
this._attemptUntilFn(isNotWhitespace);
|
||||
if (!this._attemptChar($GT)) return false;
|
||||
if (!this._attemptCharCode($SLASH)) return false;
|
||||
this._attemptCharCodeUntilFn(isNotWhitespace);
|
||||
if (!this._attemptStrCaseInsensitive(lowercaseTagName)) return false;
|
||||
this._attemptCharCodeUntilFn(isNotWhitespace);
|
||||
if (!this._attemptCharCode($GT)) return false;
|
||||
return true;
|
||||
});
|
||||
this._beginToken(HtmlTokenType.TAG_CLOSE, textToken.sourceSpan.end);
|
||||
|
@ -453,27 +470,27 @@ class _HtmlTokenizer {
|
|||
this._advance();
|
||||
} else {
|
||||
var valueStart = this.index;
|
||||
this._requireUntilFn(isNameEnd, 1);
|
||||
this._requireCharCodeUntilFn(isNameEnd, 1);
|
||||
value = this.input.substring(valueStart, this.index);
|
||||
}
|
||||
this._endToken([this._processCarriageReturns(value)]);
|
||||
}
|
||||
|
||||
// Emits the token that closes an opening tag: TAG_OPEN_END_VOID when a `/` precedes the
// `>`, TAG_OPEN_END otherwise. The `>` itself is mandatory.
private _consumeTagOpenEnd() {
  // The `/` (if any) is consumed before the token begins.
  const tokenType = this._attemptCharCode($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID :
                                                    HtmlTokenType.TAG_OPEN_END;
  this._beginToken(tokenType);
  this._requireCharCode($GT);
  this._endToken([]);
}
|
||||
|
||||
// Emits a TAG_CLOSE token whose parts are the result of `_consumePrefixAndName()`.
// Whitespace is skipped both before the name and between the name and the mandatory `>`.
private _consumeTagClose(start: ParseLocation) {
  this._beginToken(HtmlTokenType.TAG_CLOSE, start);
  this._attemptCharCodeUntilFn(isNotWhitespace);
  const prefixAndName = this._consumePrefixAndName();
  this._attemptCharCodeUntilFn(isNotWhitespace);
  this._requireCharCode($GT);
  this._endToken(prefixAndName);
}
|
||||
|
||||
|
@ -534,11 +551,19 @@ function isTextEnd(code: number): boolean {
|
|||
}
|
||||
|
||||
// True when `code` is an ASCII letter in either case (`a`-`z` or `A`-`Z`).
// The stale lower-case-only `return` that used to precede this one made the upper-case
// branch unreachable.
function isAsciiLetter(code: number): boolean {
  return (code >= $a && code <= $z) || (code >= $A && code <= $Z);
}
|
||||
|
||||
// True when `code` is an ASCII hexadecimal digit: `0`-`9`, `a`-`f` or `A`-`F`.
// The stale lower-case-only `return` that used to precede this one made the upper-case
// branch unreachable.
function isAsciiHexDigit(code: number): boolean {
  return (code >= $a && code <= $f) || (code >= $A && code <= $F) || (code >= $0 && code <= $9);
}
|
||||
|
||||
// True when the two character codes are equal after both are passed through
// `toUpperCaseCharCode`.
function compareCharCodeCaseInsensitive(code1: number, code2: number): boolean {
  return toUpperCaseCharCode(code1) === toUpperCaseCharCode(code2);
}
|
||||
|
||||
// Maps the codes of `a`-`z` to the codes of `A`-`Z`; every other code is returned
// unchanged, so no non-ASCII case folding takes place.
function toUpperCaseCharCode(code: number): number {
  return code >= $a && code <= $z ? code - $a + $A : code;
}
|
||||
|
||||
function mergeTextTokens(srcTokens: HtmlToken[]): HtmlToken[] {
|
||||
|
|
|
@ -114,9 +114,9 @@ export function main() {
|
|||
});
|
||||
});
|
||||
|
||||
describe('cdata', () => {
|
||||
it('should parse cdata', () => {
|
||||
expect(tokenizeAndHumanizeParts('<![cdata[t\ne\rs\r\nt]]>'))
|
||||
describe('CDATA', () => {
|
||||
it('should parse CDATA', () => {
|
||||
expect(tokenizeAndHumanizeParts('<![CDATA[t\ne\rs\r\nt]]>'))
|
||||
.toEqual([
|
||||
[HtmlTokenType.CDATA_START],
|
||||
[HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
|
||||
|
@ -126,22 +126,22 @@ export function main() {
|
|||
});
|
||||
|
||||
// Source spans must reproduce the input verbatim: the upper-case `CDATA[` keyword and the
// raw `\r` / `\r\n` line endings.
it('should store the locations', () => {
  expect(tokenizeAndHumanizeSourceSpans('<![CDATA[t\ne\rs\r\nt]]>'))
      .toEqual([
        [HtmlTokenType.CDATA_START, '<![CDATA['],
        [HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
        [HtmlTokenType.CDATA_END, ']]>'],
        [HtmlTokenType.EOF, '']
      ]);
});
|
||||
|
||||
// `<![` not followed by `CDATA[` is reported against the CDATA_START token at the first
// offending character.
it('should report <![ without CDATA[', () => {
  expect(tokenizeAndHumanizeErrors('<![a'))
      .toEqual([[HtmlTokenType.CDATA_START, 'Unexpected character "a"', '0:3']]);
});
|
||||
|
||||
// An unterminated CDATA section is reported as an unexpected EOF on the RAW_TEXT token.
it('should report missing end cdata', () => {
  expect(tokenizeAndHumanizeErrors('<![CDATA['))
      .toEqual([[HtmlTokenType.RAW_TEXT, 'Unexpected character "EOF"', '0:9']]);
});
|
||||
});
|
||||
|
@ -367,8 +367,8 @@ export function main() {
|
|||
});
|
||||
|
||||
// Covers both spellings of the hex prefix (`&#x...;` and `&#X...;`).
// NOTE(review): the input is written as character references here; the literal 'AA' that
// was shown in its place appears to be the same text with the entities already decoded,
// which would not exercise entity parsing at all -- confirm against the repository.
it('should parse hexadecimal entities', () => {
  expect(tokenizeAndHumanizeParts('&#x41;&#X41;'))
      .toEqual([[HtmlTokenType.TEXT, 'AA'], [HtmlTokenType.EOF]]);
});
|
||||
|
||||
it('should parse decimal entities', () => {
|
||||
|
@ -473,7 +473,7 @@ export function main() {
|
|||
});
|
||||
|
||||
it('should not detect entities', () => {
|
||||
expect(tokenizeAndHumanizeParts(`<script>&</script>`))
|
||||
expect(tokenizeAndHumanizeParts(`<script>&</SCRIPT>`))
|
||||
.toEqual([
|
||||
[HtmlTokenType.TAG_OPEN_START, null, 'script'],
|
||||
[HtmlTokenType.TAG_OPEN_END],
|
||||
|
@ -587,6 +587,19 @@ export function main() {
|
|||
});
|
||||
});
|
||||
|
||||
// 'İ' (U+0130) lower-cases to a two-code-unit string, so its length changes under
// `toLowerCase()`; the spans below check that offsets still match the original input.
describe('unicode characters', () => {
  it('should support unicode characters', () => {
    expect(tokenizeAndHumanizeSourceSpans(`<p>İ</p>`))
        .toEqual([
          [HtmlTokenType.TAG_OPEN_START, '<p'],
          [HtmlTokenType.TAG_OPEN_END, '>'],
          [HtmlTokenType.TEXT, 'İ'],
          [HtmlTokenType.TAG_CLOSE, '</p>'],
          [HtmlTokenType.EOF, '']
        ]);
  });
});
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue