From a24ee6add4e63db68e0faa8337811a7122b6fdb0 Mon Sep 17 00:00:00 2001
From: Victor Berchet <victor@suumit.com>
Date: Mon, 21 Dec 2015 11:32:58 -0800
Subject: [PATCH] fix(HtmlLexer): fix for unicode chars

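The tokenizer used to scan a lowercased copy of the input while taking
substrings and source spans from the original. Lowercasing can change the
length of a string (e.g. "İ" lowercases to two UTF-16 code units), so
offsets computed on the lowercased copy no longer lined up with the
original input.

Scan the original input instead and compare case-insensitively only where
needed: closing tags of raw-text elements, and the "x"/"X" prefix and
digits of hexadecimal entities. The "CDATA[" marker is now matched
case-sensitively, in upper case.

Fixes #6036
Closes #6061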
---
 modules/angular2/src/compiler/html_lexer.ts   | 127 +++++++++++-------
 .../angular2/test/compiler/html_lexer_spec.ts |  33 +++--
 2 files changed, 99 insertions(+), 61 deletions(-)

diff --git a/modules/angular2/src/compiler/html_lexer.ts b/modules/angular2/src/compiler/html_lexer.ts
index 2420e9d60c..b346ae45b2 100644
--- a/modules/angular2/src/compiler/html_lexer.ts
+++ b/modules/angular2/src/compiler/html_lexer.ts
@@ -73,10 +73,13 @@ const $LT = 60;
 const $EQ = 61;
 const $GT = 62;
 const $QUESTION = 63;
-const $A = 65;
-const $Z = 90;
 const $LBRACKET = 91;
 const $RBRACKET = 93;
+const $A = 65;
+const $F = 70;
+const $X = 88;
+const $Z = 90;
+
 const $a = 97;
 const $f = 102;
 const $z = 122;
@@ -102,7 +105,6 @@ class ControlFlowError {
 // See http://www.w3.org/TR/html51/syntax.html#writing
 class _HtmlTokenizer {
   private input: string;
-  private inputLowercase: string;
   private length: number;
   // Note: this is always lowercase!
   private peek: number = -1;
@@ -117,7 +119,6 @@ class _HtmlTokenizer {
 
   constructor(private file: ParseSourceFile) {
     this.input = file.content;
-    this.inputLowercase = file.content.toLowerCase();
     this.length = file.content.length;
     this._advance();
   }
@@ -133,16 +134,16 @@ class _HtmlTokenizer {
     while (this.peek !== $EOF) {
       var start = this._getLocation();
       try {
-        if (this._attemptChar($LT)) {
-          if (this._attemptChar($BANG)) {
-            if (this._attemptChar($LBRACKET)) {
+        if (this._attemptCharCode($LT)) {
+          if (this._attemptCharCode($BANG)) {
+            if (this._attemptCharCode($LBRACKET)) {
               this._consumeCdata(start);
-            } else if (this._attemptChar($MINUS)) {
+            } else if (this._attemptCharCode($MINUS)) {
               this._consumeComment(start);
             } else {
               this._consumeDocType(start);
             }
-          } else if (this._attemptChar($SLASH)) {
+          } else if (this._attemptCharCode($SLASH)) {
             this._consumeTagClose(start);
           } else {
             this._consumeTagOpen(start);
@@ -205,11 +206,10 @@ class _HtmlTokenizer {
       this.column++;
     }
     this.index++;
-    this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.inputLowercase,
-                                                                            this.index);
+    this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.input, this.index);
   }
 
-  private _attemptChar(charCode: number): boolean {
+  private _attemptCharCode(charCode: number): boolean {
     if (this.peek === charCode) {
       this._advance();
       return true;
@@ -217,38 +217,55 @@ class _HtmlTokenizer {
     return false;
   }
 
-  private _requireChar(charCode: number) {
+  private _attemptCharCodeCaseInsensitive(charCode: number): boolean {
+    if (compareCharCodeCaseInsensitive(this.peek, charCode)) {
+      this._advance();
+      return true;
+    }
+    return false;
+  }
+
+  private _requireCharCode(charCode: number) {
     var location = this._getLocation();
-    if (!this._attemptChar(charCode)) {
+    if (!this._attemptCharCode(charCode)) {
       throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
     }
   }
 
-  private _attemptChars(chars: string): boolean {
+  private _attemptStr(chars: string): boolean {
     for (var i = 0; i < chars.length; i++) {
-      if (!this._attemptChar(StringWrapper.charCodeAt(chars, i))) {
+      if (!this._attemptCharCode(StringWrapper.charCodeAt(chars, i))) {
         return false;
       }
     }
     return true;
   }
 
-  private _requireChars(chars: string) {
+  private _attemptStrCaseInsensitive(chars: string): boolean {
+    for (var i = 0; i < chars.length; i++) {
+      if (!this._attemptCharCodeCaseInsensitive(StringWrapper.charCodeAt(chars, i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private _requireStr(chars: string) {
     var location = this._getLocation();
-    if (!this._attemptChars(chars)) {
+    if (!this._attemptStr(chars)) {
       throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
     }
   }
 
-  private _attemptUntilFn(predicate: Function) {
+  private _attemptCharCodeUntilFn(predicate: Function) {
     while (!predicate(this.peek)) {
       this._advance();
     }
   }
 
-  private _requireUntilFn(predicate: Function, len: number) {
+  private _requireCharCodeUntilFn(predicate: Function, len: number) {
     var start = this._getLocation();
-    this._attemptUntilFn(predicate);
+    this._attemptCharCodeUntilFn(predicate);
     if (this.index - start.offset < len) {
       throw this._createError(unexpectedCharacterErrorMsg(this.peek), start);
     }
@@ -273,10 +290,10 @@ class _HtmlTokenizer {
   private _decodeEntity(): string {
     var start = this._getLocation();
     this._advance();
-    if (this._attemptChar($HASH)) {
-      let isHex = this._attemptChar($x);
+    if (this._attemptCharCode($HASH)) {
+      let isHex = this._attemptCharCode($x) || this._attemptCharCode($X);
       let numberStart = this._getLocation().offset;
-      this._attemptUntilFn(isDigitEntityEnd);
+      this._attemptCharCodeUntilFn(isDigitEntityEnd);
       if (this.peek != $SEMICOLON) {
         throw this._createError(unexpectedCharacterErrorMsg(this.peek), this._getLocation());
       }
@@ -291,7 +308,7 @@ class _HtmlTokenizer {
       }
     } else {
       let startPosition = this._savePosition();
-      this._attemptUntilFn(isNamedEntityEnd);
+      this._attemptCharCodeUntilFn(isNamedEntityEnd);
       if (this.peek != $SEMICOLON) {
         this._restorePosition(startPosition);
         return '&';
@@ -315,7 +332,7 @@ class _HtmlTokenizer {
     var parts = [];
     while (true) {
       tagCloseStart = this._getLocation();
-      if (this._attemptChar(firstCharOfEnd) && attemptEndRest()) {
+      if (this._attemptCharCode(firstCharOfEnd) && attemptEndRest()) {
         break;
       }
       if (this.index > tagCloseStart.offset) {
@@ -330,18 +347,18 @@ class _HtmlTokenizer {
 
   private _consumeComment(start: ParseLocation) {
     this._beginToken(HtmlTokenType.COMMENT_START, start);
-    this._requireChar($MINUS);
+    this._requireCharCode($MINUS);
     this._endToken([]);
-    var textToken = this._consumeRawText(false, $MINUS, () => this._attemptChars('->'));
+    var textToken = this._consumeRawText(false, $MINUS, () => this._attemptStr('->'));
     this._beginToken(HtmlTokenType.COMMENT_END, textToken.sourceSpan.end);
     this._endToken([]);
   }
 
   private _consumeCdata(start: ParseLocation) {
     this._beginToken(HtmlTokenType.CDATA_START, start);
-    this._requireChars('cdata[');
+    this._requireStr('CDATA[');
     this._endToken([]);
-    var textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptChars(']>'));
+    var textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptStr(']>'));
     this._beginToken(HtmlTokenType.CDATA_END, textToken.sourceSpan.end);
     this._endToken([]);
   }
@@ -367,7 +384,7 @@ class _HtmlTokenizer {
     } else {
       nameStart = nameOrPrefixStart;
     }
-    this._requireUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
+    this._requireCharCodeUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
     var name = this.input.substring(nameStart, this.index);
     return [prefix, name];
   }
@@ -381,16 +398,16 @@ class _HtmlTokenizer {
       }
       var nameStart = this.index;
       this._consumeTagOpenStart(start);
-      lowercaseTagName = this.inputLowercase.substring(nameStart, this.index);
-      this._attemptUntilFn(isNotWhitespace);
+      lowercaseTagName = this.input.substring(nameStart, this.index).toLowerCase();
+      this._attemptCharCodeUntilFn(isNotWhitespace);
       while (this.peek !== $SLASH && this.peek !== $GT) {
         this._consumeAttributeName();
-        this._attemptUntilFn(isNotWhitespace);
-        if (this._attemptChar($EQ)) {
-          this._attemptUntilFn(isNotWhitespace);
+        this._attemptCharCodeUntilFn(isNotWhitespace);
+        if (this._attemptCharCode($EQ)) {
+          this._attemptCharCodeUntilFn(isNotWhitespace);
           this._consumeAttributeValue();
         }
-        this._attemptUntilFn(isNotWhitespace);
+        this._attemptCharCodeUntilFn(isNotWhitespace);
       }
       this._consumeTagOpenEnd();
     } catch (e) {
@@ -416,11 +433,11 @@ class _HtmlTokenizer {
 
   private _consumeRawTextWithTagClose(lowercaseTagName: string, decodeEntities: boolean) {
     var textToken = this._consumeRawText(decodeEntities, $LT, () => {
-      if (!this._attemptChar($SLASH)) return false;
-      this._attemptUntilFn(isNotWhitespace);
-      if (!this._attemptChars(lowercaseTagName)) return false;
-      this._attemptUntilFn(isNotWhitespace);
-      if (!this._attemptChar($GT)) return false;
+      if (!this._attemptCharCode($SLASH)) return false;
+      this._attemptCharCodeUntilFn(isNotWhitespace);
+      if (!this._attemptStrCaseInsensitive(lowercaseTagName)) return false;
+      this._attemptCharCodeUntilFn(isNotWhitespace);
+      if (!this._attemptCharCode($GT)) return false;
       return true;
     });
     this._beginToken(HtmlTokenType.TAG_CLOSE, textToken.sourceSpan.end);
@@ -453,27 +470,27 @@ class _HtmlTokenizer {
       this._advance();
     } else {
       var valueStart = this.index;
-      this._requireUntilFn(isNameEnd, 1);
+      this._requireCharCodeUntilFn(isNameEnd, 1);
       value = this.input.substring(valueStart, this.index);
     }
     this._endToken([this._processCarriageReturns(value)]);
   }
 
   private _consumeTagOpenEnd() {
-    var tokenType =
-        this._attemptChar($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID : HtmlTokenType.TAG_OPEN_END;
+    var tokenType = this._attemptCharCode($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID :
+                                                    HtmlTokenType.TAG_OPEN_END;
     this._beginToken(tokenType);
-    this._requireChar($GT);
+    this._requireCharCode($GT);
     this._endToken([]);
   }
 
   private _consumeTagClose(start: ParseLocation) {
     this._beginToken(HtmlTokenType.TAG_CLOSE, start);
-    this._attemptUntilFn(isNotWhitespace);
+    this._attemptCharCodeUntilFn(isNotWhitespace);
     var prefixAndName;
     prefixAndName = this._consumePrefixAndName();
-    this._attemptUntilFn(isNotWhitespace);
-    this._requireChar($GT);
+    this._attemptCharCodeUntilFn(isNotWhitespace);
+    this._requireCharCode($GT);
     this._endToken(prefixAndName);
   }
 
@@ -534,11 +551,19 @@ function isTextEnd(code: number): boolean {
 }
 
 function isAsciiLetter(code: number): boolean {
-  return code >= $a && code <= $z;
+  return code >= $a && code <= $z || code >= $A && code <= $Z;
 }
 
 function isAsciiHexDigit(code: number): boolean {
-  return code >= $a && code <= $f || code >= $0 && code <= $9;
+  return code >= $a && code <= $f || code >= $A && code <= $F || code >= $0 && code <= $9;
+}
+
+function compareCharCodeCaseInsensitive(code1: number, code2: number): boolean {
+  return toUpperCaseCharCode(code1) == toUpperCaseCharCode(code2);
+}
+
+function toUpperCaseCharCode(code: number): number {
+  return code >= $a && code <= $z ? code - $a + $A : code;
 }
 
 function mergeTextTokens(srcTokens: HtmlToken[]): HtmlToken[] {
diff --git a/modules/angular2/test/compiler/html_lexer_spec.ts b/modules/angular2/test/compiler/html_lexer_spec.ts
index 5e61887eef..0ca26328e1 100644
--- a/modules/angular2/test/compiler/html_lexer_spec.ts
+++ b/modules/angular2/test/compiler/html_lexer_spec.ts
@@ -114,9 +114,9 @@ export function main() {
       });
     });
 
-    describe('cdata', () => {
-      it('should parse cdata', () => {
-        expect(tokenizeAndHumanizeParts('<![cdata[t\ne\rs\r\nt]]>'))
+    describe('CDATA', () => {
+      it('should parse CDATA', () => {
+        expect(tokenizeAndHumanizeParts('<![CDATA[t\ne\rs\r\nt]]>'))
             .toEqual([
               [HtmlTokenType.CDATA_START],
               [HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
@@ -126,22 +126,22 @@ export function main() {
       });
 
       it('should store the locations', () => {
-        expect(tokenizeAndHumanizeSourceSpans('<![cdata[t\ne\rs\r\nt]]>'))
+        expect(tokenizeAndHumanizeSourceSpans('<![CDATA[t\ne\rs\r\nt]]>'))
             .toEqual([
-              [HtmlTokenType.CDATA_START, '<![cdata['],
+              [HtmlTokenType.CDATA_START, '<![CDATA['],
               [HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
               [HtmlTokenType.CDATA_END, ']]>'],
               [HtmlTokenType.EOF, '']
             ]);
       });
 
-      it('should report <![ without cdata[', () => {
+      it('should report <![ without CDATA[', () => {
         expect(tokenizeAndHumanizeErrors('<![a'))
             .toEqual([[HtmlTokenType.CDATA_START, 'Unexpected character "a"', '0:3']]);
       });
 
       it('should report missing end cdata', () => {
-        expect(tokenizeAndHumanizeErrors('<![cdata['))
+        expect(tokenizeAndHumanizeErrors('<![CDATA['))
             .toEqual([[HtmlTokenType.RAW_TEXT, 'Unexpected character "EOF"', '0:9']]);
       });
     });
@@ -367,8 +367,8 @@ export function main() {
       });
 
       it('should parse hexadecimal entities', () => {
-        expect(tokenizeAndHumanizeParts('&#x41;'))
-            .toEqual([[HtmlTokenType.TEXT, 'A'], [HtmlTokenType.EOF]]);
+        expect(tokenizeAndHumanizeParts('&#x41;&#X41;'))
+            .toEqual([[HtmlTokenType.TEXT, 'AA'], [HtmlTokenType.EOF]]);
       });
 
       it('should parse decimal entities', () => {
@@ -473,7 +473,7 @@ export function main() {
       });
 
       it('should not detect entities', () => {
-        expect(tokenizeAndHumanizeParts(`<script>&amp;</script>`))
+        expect(tokenizeAndHumanizeParts(`<script>&amp;</SCRIPT>`))
             .toEqual([
               [HtmlTokenType.TAG_OPEN_START, null, 'script'],
               [HtmlTokenType.TAG_OPEN_END],
@@ -587,6 +587,19 @@ export function main() {
       });
     });
 
+    describe('unicode characters', () => {
+      it('should support unicode characters', () => {
+        expect(tokenizeAndHumanizeSourceSpans(`<p>İ</p>`))
+            .toEqual([
+              [HtmlTokenType.TAG_OPEN_START, '<p'],
+              [HtmlTokenType.TAG_OPEN_END, '>'],
+              [HtmlTokenType.TEXT, 'İ'],
+              [HtmlTokenType.TAG_CLOSE, '</p>'],
+              [HtmlTokenType.EOF, '']
+            ]);
+      });
+    });
+
   });
 }