diff --git a/packages/compiler/src/ml_parser/lexer.ts b/packages/compiler/src/ml_parser/lexer.ts index 0e2f44f2b0..ef798bb3aa 100644 --- a/packages/compiler/src/ml_parser/lexer.ts +++ b/packages/compiler/src/ml_parser/lexer.ts @@ -120,17 +120,10 @@ class _ControlFlowError { // See http://www.w3.org/TR/html51/syntax.html#writing class _Tokenizer { - private _input: string; - private _end: number; + private _cursor: CharacterCursor; private _tokenizeIcu: boolean; private _interpolationConfig: InterpolationConfig; - private _escapedString: boolean; - private _peek: number = -1; - private _nextPeek: number = -1; - private _index: number; - private _line: number; - private _column: number; - private _currentTokenStart: ParseLocation|null = null; + private _currentTokenStart: CharacterCursor|null = null; private _currentTokenType: TokenType|null = null; private _expansionCaseStack: TokenType[] = []; private _inInterpolation: boolean = false; @@ -145,31 +138,18 @@ class _Tokenizer { * @param _interpolationConfig */ constructor( - private _file: ParseSourceFile, private _getTagDefinition: (tagName: string) => TagDefinition, + _file: ParseSourceFile, private _getTagDefinition: (tagName: string) => TagDefinition, options: TokenizeOptions) { this._tokenizeIcu = options.tokenizeExpansionForms || false; this._interpolationConfig = options.interpolationConfig || DEFAULT_INTERPOLATION_CONFIG; - this._escapedString = options.escapedString || false; - this._input = _file.content; - if (options.range) { - this._end = options.range.endPos; - this._index = options.range.startPos; - this._line = options.range.startLine; - this._column = options.range.startCol; - } else { - this._end = this._input.length; - this._index = 0; - this._line = 0; - this._column = 0; - } + const range = + options.range || {endPos: _file.content.length, startPos: 0, startLine: 0, startCol: 0}; + this._cursor = options.escapedString ? new EscapedCharacterCursor(_file, range) : + new PlainCharacterCursor(_file, range); try { - this._initPeek(); + this._cursor.init(); } catch (e) { - if (e instanceof _ControlFlowError) { - this.errors.push(e.error); - } else { - throw e; - } + this.handleError(e); } } @@ -182,8 +162,8 @@ class _Tokenizer { } tokenize(): TokenizeResult { - while (this._peek !== chars.$EOF) { - const start = this._getLocation(); + while (this._cursor.peek() !== chars.$EOF) { + const start = this._cursor.clone(); try { if (this._attemptCharCode(chars.$LT)) { if (this._attemptCharCode(chars.$BANG)) { @@ -203,11 +183,7 @@ class _Tokenizer { this._consumeText(); } } catch (e) { - if (e instanceof _ControlFlowError) { - this.errors.push(e.error); - } else { - throw e; - } + this.handleError(e); } } this._beginToken(TokenType.EOF); @@ -220,17 +196,17 @@ class _Tokenizer { * @internal */ private _tokenizeExpansionForm(): boolean { - if (isExpansionFormStart(this._input, this._index, this._interpolationConfig)) { + if (this.isExpansionFormStart()) { this._consumeExpansionFormStart(); return true; } - if (isExpansionCaseStart(this._peek) && this._isInExpansionForm()) { + if (isExpansionCaseStart(this._cursor.peek()) && this._isInExpansionForm()) { this._consumeExpansionCaseStart(); return true; } - if (this._peek === chars.$RBRACE) { + if (this._cursor.peek() === chars.$RBRACE) { if (this._isInExpansionCase()) { this._consumeExpansionCaseEnd(); return true; @@ -245,34 +221,24 @@ class _Tokenizer { return false; } - private _getLocation(): ParseLocation { - return new ParseLocation(this._file, this._index, this._line, this._column); - } - - private _getSpan( - start: ParseLocation = this._getLocation(), - end: ParseLocation = this._getLocation()): ParseSourceSpan { - return new ParseSourceSpan(start, end); - } - - private _beginToken(type: TokenType, start: ParseLocation = this._getLocation()) { + private _beginToken(type: TokenType, start = this._cursor.clone()) { this._currentTokenStart = start; this._currentTokenType = type; } - private _endToken(parts: string[], end: ParseLocation = this._getLocation()): Token { + private _endToken(parts: string[], end = this._cursor.clone()): Token { if (this._currentTokenStart === null) { throw new TokenError( 'Programming error - attempted to end a token when there was no start to the token', - this._currentTokenType, this._getSpan(end, end)); + this._currentTokenType, this._cursor.getSpan(end)); } if (this._currentTokenType === null) { throw new TokenError( 'Programming error - attempted to end a token which has no token type', null, - this._getSpan(this._currentTokenStart, end)); + this._cursor.getSpan(this._currentTokenStart)); } const token = - new Token(this._currentTokenType, parts, new ParseSourceSpan(this._currentTokenStart, end)); + new Token(this._currentTokenType, parts, this._cursor.getSpan(this._currentTokenStart)); this.tokens.push(token); this._currentTokenStart = null; this._currentTokenType = null; @@ -284,89 +250,57 @@ class _Tokenizer { msg += ` (Do you have an unescaped "{" in your template? Use "{{ '{' }}") to escape it.)`; } const error = new TokenError(msg, this._currentTokenType, span); - this._currentTokenStart = null !; - this._currentTokenType = null !; + this._currentTokenStart = null; + this._currentTokenType = null; return new _ControlFlowError(error); } - private _advance(processingEscapeSequence?: boolean) { - if (this._index >= this._end) { - throw this._createError(_unexpectedCharacterErrorMsg(chars.$EOF), this._getSpan()); + private handleError(e: any) { + if (e instanceof CursorError) { + e = this._createError(e.msg, this._cursor.getSpan(e.cursor)); } - // The actual character in the input might be different to the _peek if we are processing - // escape characters. We only want to track "real" new lines. - const actualChar = this._input.charCodeAt(this._index); - if (actualChar === chars.$LF) { - this._line++; - this._column = 0; - } else if (!chars.isNewLine(actualChar)) { - this._column++; - } - this._index++; - this._initPeek(processingEscapeSequence); - } - - /** - * Initialize the _peek and _nextPeek properties based on the current _index. - * @param processingEscapeSequence whether we are in the middle of processing an escape sequence. - */ - private _initPeek(processingEscapeSequence?: boolean) { - this._peek = this._index >= this._end ? chars.$EOF : this._input.charCodeAt(this._index); - this._nextPeek = - this._index + 1 >= this._end ? chars.$EOF : this._input.charCodeAt(this._index + 1); - if (this._peek === chars.$BACKSLASH && processingEscapeSequence !== true && - this._escapedString) { - this._processEscapeSequence(); - } - } - - /** - * Advance the specific number of characters. - * @param count The number of characters to advance. - * @param processingEscapeSequence Whether we want `advance()` to process escape sequences. - */ - private _advanceN(count: number, processingEscapeSequence?: boolean) { - while (count) { - this._advance(processingEscapeSequence); - count--; + if (e instanceof _ControlFlowError) { + this.errors.push(e.error); + } else { + throw e; } } private _attemptCharCode(charCode: number): boolean { - if (this._peek === charCode) { - this._advance(); + if (this._cursor.peek() === charCode) { + this._cursor.advance(); return true; } return false; } private _attemptCharCodeCaseInsensitive(charCode: number): boolean { - if (compareCharCodeCaseInsensitive(this._peek, charCode)) { - this._advance(); + if (compareCharCodeCaseInsensitive(this._cursor.peek(), charCode)) { + this._cursor.advance(); return true; } return false; } private _requireCharCode(charCode: number) { - const location = this._getLocation(); + const location = this._cursor.clone(); if (!this._attemptCharCode(charCode)) { throw this._createError( - _unexpectedCharacterErrorMsg(this._peek), this._getSpan(location, location)); + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(location)); } } private _attemptStr(chars: string): boolean { const len = chars.length; - if (this._index + len > this._end) { + if (this._cursor.charsLeft() < len) { return false; } - const initialPosition = this._savePosition(); + const initialPosition = this._cursor.clone(); for (let i = 0; i < len; i++) { if (!this._attemptCharCode(chars.charCodeAt(i))) { // If attempting to parse the string fails, we want to reset the parser // to where it was before the attempt - this._restorePosition(initialPosition); + this._cursor = initialPosition; return false; } } @@ -383,277 +317,162 @@ class _Tokenizer { } private _requireStr(chars: string) { - const location = this._getLocation(); + const location = this._cursor.clone(); if (!this._attemptStr(chars)) { - throw this._createError(_unexpectedCharacterErrorMsg(this._peek), this._getSpan(location)); + throw this._createError( + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(location)); } } private _attemptCharCodeUntilFn(predicate: (code: number) => boolean) { - while (!predicate(this._peek)) { - this._advance(); + while (!predicate(this._cursor.peek())) { + this._cursor.advance(); } } private _requireCharCodeUntilFn(predicate: (code: number) => boolean, len: number) { - const start = this._getLocation(); + const start = this._cursor.clone(); this._attemptCharCodeUntilFn(predicate); - if (this._index - start.offset < len) { + const end = this._cursor.clone(); + if (end.diff(start) < len) { throw this._createError( - _unexpectedCharacterErrorMsg(this._peek), this._getSpan(start, start)); + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(start)); } } private _attemptUntilChar(char: number) { - while (this._peek !== char) { - this._advance(); + while (this._cursor.peek() !== char) { + this._cursor.advance(); } } private _readChar(decodeEntities: boolean): string { - if (decodeEntities && this._peek === chars.$AMPERSAND) { + if (decodeEntities && this._cursor.peek() === chars.$AMPERSAND) { return this._decodeEntity(); } else { // Don't rely upon reading directly from `_input` as the actual char value // may have been generated from an escape sequence. - const char = String.fromCodePoint(this._peek); - this._advance(); + const char = String.fromCodePoint(this._cursor.peek()); + this._cursor.advance(); return char; } } private _decodeEntity(): string { - const start = this._getLocation(); - this._advance(); + const start = this._cursor.clone(); + this._cursor.advance(); if (this._attemptCharCode(chars.$HASH)) { const isHex = this._attemptCharCode(chars.$x) || this._attemptCharCode(chars.$X); - const numberStart = this._getLocation().offset; + const codeStart = this._cursor.clone(); this._attemptCharCodeUntilFn(isDigitEntityEnd); - if (this._peek != chars.$SEMICOLON) { - throw this._createError(_unexpectedCharacterErrorMsg(this._peek), this._getSpan()); + if (this._cursor.peek() != chars.$SEMICOLON) { + throw this._createError( + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan()); } - this._advance(); - const strNum = this._input.substring(numberStart, this._index - 1); + const strNum = this._cursor.getChars(codeStart); + this._cursor.advance(); try { const charCode = parseInt(strNum, isHex ? 16 : 10); return String.fromCharCode(charCode); } catch { - const entity = this._input.substring(start.offset + 1, this._index - 1); - throw this._createError(_unknownEntityErrorMsg(entity), this._getSpan(start)); + throw this._createError( + _unknownEntityErrorMsg(this._cursor.getChars(start)), this._cursor.getSpan()); } } else { - const startPosition = this._savePosition(); + const nameStart = this._cursor.clone(); this._attemptCharCodeUntilFn(isNamedEntityEnd); - if (this._peek != chars.$SEMICOLON) { - this._restorePosition(startPosition); + if (this._cursor.peek() != chars.$SEMICOLON) { + this._cursor = nameStart; return '&'; } - this._advance(); - const name = this._input.substring(start.offset + 1, this._index - 1); + const name = this._cursor.getChars(nameStart); + this._cursor.advance(); const char = NAMED_ENTITIES[name]; if (!char) { - throw this._createError(_unknownEntityErrorMsg(name), this._getSpan(start)); + throw this._createError(_unknownEntityErrorMsg(name), this._cursor.getSpan(start)); } return char; } } - /** - * Process the escape sequence that starts at the current position in the text. - * - * This method is called from `_advance()` to ensure that escape sequences are - * always processed correctly however tokens are being consumed. - * - * But note that this method also calls `_advance()` (re-entering) to move through - * the characters within an escape sequence. In that case it tells `_advance()` not - * to attempt to process further escape sequences by passing `true` as its first - * argument. - */ - private _processEscapeSequence(): void { - this._advance(true); // advance past the backslash - - // First check for standard control char sequences - if (this._peekChar() === chars.$n) { - this._peek = chars.$LF; - } else if (this._peekChar() === chars.$r) { - this._peek = chars.$CR; - } else if (this._peekChar() === chars.$v) { - this._peek = chars.$VTAB; - } else if (this._peekChar() === chars.$t) { - this._peek = chars.$TAB; - } else if (this._peekChar() === chars.$b) { - this._peek = chars.$BSPACE; - } else if (this._peekChar() === chars.$f) { - this._peek = chars.$FF; - } - - // Now consider more complex sequences - - else if (this._peekChar() === chars.$u) { - // Unicode code-point sequence - this._advance(true); // advance past the `u` char - if (this._peekChar() === chars.$LBRACE) { - // Variable length Unicode, e.g. `\x{123}` - this._advance(true); // advance past the `{` char - // Advance past the variable number of hex digits until we hit a `}` char - const start = this._getLocation(); - while (this._peekChar() !== chars.$RBRACE) { - this._advance(true); - } - this._decodeHexDigits(start, this._index - start.offset); - } else { - // Fixed length Unicode, e.g. `\u1234` - this._parseFixedHexSequence(4); - } - } - - else if (this._peekChar() === chars.$x) { - // Hex char code, e.g. `\x2F` - this._advance(true); // advance past the `x` char - this._parseFixedHexSequence(2); - } - - else if (chars.isOctalDigit(this._peekChar())) { - // Octal char code, e.g. `\012`, - const start = this._index; - let length = 1; - // Note that we work with `_nextPeek` because, although we check the next character - // after the sequence to find the end of the sequence, - // we do not want to advance that far to check the character, otherwise we will - // have to back up. - while (chars.isOctalDigit(this._nextPeek) && length < 3) { - this._advance(true); - length++; - } - const octal = this._input.substr(start, length); - this._peek = parseInt(octal, 8); - } - - else if (chars.isNewLine(this._peekChar())) { - // Line continuation `\` followed by a new line - this._advance(true); // advance over the newline - } - - // If none of the `if` blocks were executed then we just have an escaped normal character. - // In that case we just, effectively, skip the backslash from the character. - } - - private _parseFixedHexSequence(length: number) { - const start = this._getLocation(); - this._advanceN(length - 1, true); - this._decodeHexDigits(start, length); - } - - private _decodeHexDigits(start: ParseLocation, length: number) { - const hex = this._input.substr(start.offset, length); - const charCode = parseInt(hex, 16); - if (!isNaN(charCode)) { - this._peek = charCode; - } else { - throw this._createError( - 'Invalid hexadecimal escape sequence', this._getSpan(start, this._getLocation())); - } - } - - /** - * This little helper is to solve a problem where the TS compiler will narrow - * the type of `_peek` after an `if` statment, even if there is a call to a - * method that might mutate the `_peek`. - * - * For example: - * - * ``` - * if (this._peek === 10) { - * this._advance(); // mutates _peek - * if (this._peek === 20) { - * ... - * ``` - * - * The second if statement fails TS compilation because the compiler has determined - * that `_peek` is `10` and so can never be equal to `20`. - */ - private _peekChar(): number { return this._peek; } - - private _consumeRawText( - decodeEntities: boolean, firstCharOfEnd: number, attemptEndRest: () => boolean): Token { - let tagCloseStart: ParseLocation; - const textStart = this._getLocation(); - this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT, textStart); + private _consumeRawText(decodeEntities: boolean, endMarkerPredicate: () => boolean): Token { + this._beginToken(decodeEntities ? TokenType.ESCAPABLE_RAW_TEXT : TokenType.RAW_TEXT); const parts: string[] = []; while (true) { - tagCloseStart = this._getLocation(); - if (this._attemptCharCode(firstCharOfEnd) && attemptEndRest()) { + const tagCloseStart = this._cursor.clone(); + const foundEndMarker = endMarkerPredicate(); + this._cursor = tagCloseStart; + if (foundEndMarker) { break; } - if (this._index > tagCloseStart.offset) { - // add the characters consumed by the previous if statement to the output - parts.push(this._input.substring(tagCloseStart.offset, this._index)); - } - while (this._peek !== firstCharOfEnd) { - parts.push(this._readChar(decodeEntities)); - } + parts.push(this._readChar(decodeEntities)); } - return this._endToken([this._processCarriageReturns(parts.join(''))], tagCloseStart); + return this._endToken([this._processCarriageReturns(parts.join(''))]); } - private _consumeComment(start: ParseLocation) { + private _consumeComment(start: CharacterCursor) { this._beginToken(TokenType.COMMENT_START, start); this._requireCharCode(chars.$MINUS); this._endToken([]); - const textToken = this._consumeRawText(false, chars.$MINUS, () => this._attemptStr('->')); - this._beginToken(TokenType.COMMENT_END, textToken.sourceSpan.end); + this._consumeRawText(false, () => this._attemptStr('-->')); + this._beginToken(TokenType.COMMENT_END); + this._requireStr('-->'); this._endToken([]); } - private _consumeCdata(start: ParseLocation) { + private _consumeCdata(start: CharacterCursor) { this._beginToken(TokenType.CDATA_START, start); this._requireStr('CDATA['); this._endToken([]); - const textToken = this._consumeRawText(false, chars.$RBRACKET, () => this._attemptStr(']>')); - this._beginToken(TokenType.CDATA_END, textToken.sourceSpan.end); + this._consumeRawText(false, () => this._attemptStr(']]>')); + this._beginToken(TokenType.CDATA_END); + this._requireStr(']]>'); this._endToken([]); } - private _consumeDocType(start: ParseLocation) { + private _consumeDocType(start: CharacterCursor) { this._beginToken(TokenType.DOC_TYPE, start); + const contentStart = this._cursor.clone(); this._attemptUntilChar(chars.$GT); - this._advance(); - this._endToken([this._input.substring(start.offset + 2, this._index - 1)]); + const content = this._cursor.getChars(contentStart); + this._cursor.advance(); + this._endToken([content]); } private _consumePrefixAndName(): string[] { - const nameOrPrefixStart = this._index; + const nameOrPrefixStart = this._cursor.clone(); let prefix: string = ''; - while (this._peek !== chars.$COLON && !isPrefixEnd(this._peek)) { - this._advance(); + while (this._cursor.peek() !== chars.$COLON && !isPrefixEnd(this._cursor.peek())) { + this._cursor.advance(); } - let nameStart: number; - if (this._peek === chars.$COLON) { - this._advance(); - prefix = this._input.substring(nameOrPrefixStart, this._index - 1); - nameStart = this._index; + let nameStart: CharacterCursor; + if (this._cursor.peek() === chars.$COLON) { + prefix = this._cursor.getChars(nameOrPrefixStart); + this._cursor.advance(); + nameStart = this._cursor.clone(); } else { nameStart = nameOrPrefixStart; } - this._requireCharCodeUntilFn(isNameEnd, this._index === nameStart ? 1 : 0); - const name = this._input.substring(nameStart, this._index); + this._requireCharCodeUntilFn(isNameEnd, prefix === '' ? 0 : 1); + const name = this._cursor.getChars(nameStart); return [prefix, name]; } - private _consumeTagOpen(start: ParseLocation) { - const savedPos = this._savePosition(); + private _consumeTagOpen(start: CharacterCursor) { let tagName: string; let prefix: string; let openTagToken: Token|undefined; + const innerStart = this._cursor.clone(); try { - if (!chars.isAsciiLetter(this._peek)) { - throw this._createError(_unexpectedCharacterErrorMsg(this._peek), this._getSpan()); + if (!chars.isAsciiLetter(this._cursor.peek())) { + throw this._createError( + _unexpectedCharacterErrorMsg(this._cursor.peek()), this._cursor.getSpan(start)); } openTagToken = this._consumeTagOpenStart(start); prefix = openTagToken.parts[0]; tagName = openTagToken.parts[1]; this._attemptCharCodeUntilFn(isNotWhitespace); - while (this._peek !== chars.$SLASH && this._peek !== chars.$GT) { + while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT) { this._consumeAttributeName(); this._attemptCharCodeUntilFn(isNotWhitespace); if (this._attemptCharCode(chars.$EQ)) { @@ -666,7 +485,10 @@ class _Tokenizer { } catch (e) { if (e instanceof _ControlFlowError) { // When the start tag is invalid, assume we want a "<" - this._restorePosition(savedPos); + this._cursor = innerStart; + if (openTagToken) { + this.tokens.pop(); + } // Back to back text tokens are merged at the end this._beginToken(TokenType.TEXT, start); this._endToken(['<']); @@ -686,18 +508,21 @@ class _Tokenizer { } private _consumeRawTextWithTagClose(prefix: string, tagName: string, decodeEntities: boolean) { - const textToken = this._consumeRawText(decodeEntities, chars.$LT, () => { + const textToken = this._consumeRawText(decodeEntities, () => { + if (!this._attemptCharCode(chars.$LT)) return false; if (!this._attemptCharCode(chars.$SLASH)) return false; this._attemptCharCodeUntilFn(isNotWhitespace); if (!this._attemptStrCaseInsensitive(tagName)) return false; this._attemptCharCodeUntilFn(isNotWhitespace); return this._attemptCharCode(chars.$GT); }); - this._beginToken(TokenType.TAG_CLOSE, textToken.sourceSpan.end); + this._beginToken(TokenType.TAG_CLOSE); + this._requireCharCodeUntilFn(code => code === chars.$GT, 3); + this._cursor.advance(); // Consume the `>` this._endToken([prefix, tagName]); } - private _consumeTagOpenStart(start: ParseLocation) { + private _consumeTagOpenStart(start: CharacterCursor) { this._beginToken(TokenType.TAG_OPEN_START, start); const parts = this._consumePrefixAndName(); return this._endToken(parts); @@ -711,26 +536,26 @@ class _Tokenizer { private _consumeAttributeValue() { let value: string; - if (this._peek === chars.$SQ || this._peek === chars.$DQ) { + if (this._cursor.peek() === chars.$SQ || this._cursor.peek() === chars.$DQ) { this._beginToken(TokenType.ATTR_QUOTE); - const quoteChar = this._peek; - this._advance(); + const quoteChar = this._cursor.peek(); + this._cursor.advance(); this._endToken([String.fromCodePoint(quoteChar)]); this._beginToken(TokenType.ATTR_VALUE); const parts: string[] = []; - while (this._peek !== quoteChar) { + while (this._cursor.peek() !== quoteChar) { parts.push(this._readChar(true)); } value = parts.join(''); this._endToken([this._processCarriageReturns(value)]); this._beginToken(TokenType.ATTR_QUOTE); - this._advance(); + this._cursor.advance(); this._endToken([String.fromCodePoint(quoteChar)]); } else { this._beginToken(TokenType.ATTR_VALUE); - const valueStart = this._index; + const valueStart = this._cursor.clone(); this._requireCharCodeUntilFn(isNameEnd, 1); - value = this._input.substring(valueStart, this._index); + value = this._cursor.getChars(valueStart); this._endToken([this._processCarriageReturns(value)]); } } @@ -743,7 +568,7 @@ class _Tokenizer { this._endToken([]); } - private _consumeTagClose(start: ParseLocation) { + private _consumeTagClose(start: CharacterCursor) { this._beginToken(TokenType.TAG_CLOSE, start); this._attemptCharCodeUntilFn(isNotWhitespace); const prefixAndName = this._consumePrefixAndName(); @@ -753,50 +578,50 @@ class _Tokenizer { } private _consumeExpansionFormStart() { - this._beginToken(TokenType.EXPANSION_FORM_START, this._getLocation()); + this._beginToken(TokenType.EXPANSION_FORM_START); this._requireCharCode(chars.$LBRACE); this._endToken([]); this._expansionCaseStack.push(TokenType.EXPANSION_FORM_START); - this._beginToken(TokenType.RAW_TEXT, this._getLocation()); + this._beginToken(TokenType.RAW_TEXT); const condition = this._readUntil(chars.$COMMA); - this._endToken([condition], this._getLocation()); + this._endToken([condition]); this._requireCharCode(chars.$COMMA); this._attemptCharCodeUntilFn(isNotWhitespace); - this._beginToken(TokenType.RAW_TEXT, this._getLocation()); + this._beginToken(TokenType.RAW_TEXT); const type = this._readUntil(chars.$COMMA); - this._endToken([type], this._getLocation()); + this._endToken([type]); this._requireCharCode(chars.$COMMA); this._attemptCharCodeUntilFn(isNotWhitespace); } private _consumeExpansionCaseStart() { - this._beginToken(TokenType.EXPANSION_CASE_VALUE, this._getLocation()); + this._beginToken(TokenType.EXPANSION_CASE_VALUE); const value = this._readUntil(chars.$LBRACE).trim(); - this._endToken([value], this._getLocation()); + this._endToken([value]); this._attemptCharCodeUntilFn(isNotWhitespace); - this._beginToken(TokenType.EXPANSION_CASE_EXP_START, this._getLocation()); + this._beginToken(TokenType.EXPANSION_CASE_EXP_START); this._requireCharCode(chars.$LBRACE); - this._endToken([], this._getLocation()); + this._endToken([]); this._attemptCharCodeUntilFn(isNotWhitespace); this._expansionCaseStack.push(TokenType.EXPANSION_CASE_EXP_START); } private _consumeExpansionCaseEnd() { - this._beginToken(TokenType.EXPANSION_CASE_EXP_END, this._getLocation()); + this._beginToken(TokenType.EXPANSION_CASE_EXP_END); this._requireCharCode(chars.$RBRACE); - this._endToken([], this._getLocation()); + this._endToken([]); this._attemptCharCodeUntilFn(isNotWhitespace); this._expansionCaseStack.pop(); } private _consumeExpansionFormEnd() { - this._beginToken(TokenType.EXPANSION_FORM_END, this._getLocation()); + this._beginToken(TokenType.EXPANSION_FORM_END); this._requireCharCode(chars.$RBRACE); this._endToken([]); @@ -804,7 +629,7 @@ class _Tokenizer { } private _consumeText() { - const start = this._getLocation(); + const start = this._cursor.clone(); this._beginToken(TokenType.TEXT, start); const parts: string[] = []; @@ -826,17 +651,17 @@ class _Tokenizer { } private _isTextEnd(): boolean { - if (this._peek === chars.$LT || this._peek === chars.$EOF) { + if (this._cursor.peek() === chars.$LT || this._cursor.peek() === chars.$EOF) { return true; } if (this._tokenizeIcu && !this._inInterpolation) { - if (isExpansionFormStart(this._input, this._index, this._interpolationConfig)) { + if (this.isExpansionFormStart()) { // start of an expansion form return true; } - if (this._peek === chars.$RBRACE && this._isInExpansionCase()) { + if (this._cursor.peek() === chars.$RBRACE && this._isInExpansionCase()) { // end of and expansion case return true; } @@ -845,26 +670,10 @@ class _Tokenizer { return false; } - private _savePosition(): [number, number, number, number, number] { - return [this._peek, this._index, this._column, this._line, this.tokens.length]; - } - private _readUntil(char: number): string { - const start = this._index; + const start = this._cursor.clone(); this._attemptUntilChar(char); - return this._input.substring(start, this._index); - } - - private _restorePosition(position: [number, number, number, number, number]): void { - this._peek = position[0]; - this._index = position[1]; - this._column = position[2]; - this._line = position[3]; - const nbTokens = position[4]; - if (nbTokens < this.tokens.length) { - // remove any extra tokens - this.tokens = this.tokens.slice(0, nbTokens); - } + return this._cursor.getChars(start); } private _isInExpansionCase(): boolean { @@ -878,6 +687,19 @@ class _Tokenizer { this._expansionCaseStack[this._expansionCaseStack.length - 1] === TokenType.EXPANSION_FORM_START; } + + private isExpansionFormStart(): boolean { + if (this._cursor.peek() !== chars.$LBRACE) { + return false; + } + if (this._interpolationConfig) { + const start = this._cursor.clone(); + const isInterpolation = this._attemptStr(this._interpolationConfig.start); + this._cursor = start; + return !isInterpolation; + } + return true; + } } function isNotWhitespace(code: number): boolean { @@ -902,14 +724,6 @@ function isNamedEntityEnd(code: number): boolean { return code == chars.$SEMICOLON || code == chars.$EOF || !chars.isAsciiLetter(code); } -function isExpansionFormStart( - input: string, offset: number, interpolationConfig: InterpolationConfig): boolean { - const isInterpolationStart = - interpolationConfig ? input.indexOf(interpolationConfig.start, offset) == offset : false; - - return input.charCodeAt(offset) == chars.$LBRACE && !isInterpolationStart; -} - function isExpansionCaseStart(peek: number): boolean { return peek === chars.$EQ || chars.isAsciiLetter(peek) || chars.isDigit(peek); } @@ -928,7 +742,7 @@ function mergeTextTokens(srcTokens: Token[]): Token[] { for (let i = 0; i < srcTokens.length; i++) { const token = srcTokens[i]; if (lastDstToken && lastDstToken.type == TokenType.TEXT && token.type == TokenType.TEXT) { - lastDstToken.parts[0] += token.parts[0]; + lastDstToken.parts[0] ! += token.parts[0]; lastDstToken.sourceSpan.end = token.sourceSpan.end; } else { lastDstToken = token; @@ -938,3 +752,258 @@ function mergeTextTokens(srcTokens: Token[]): Token[] { return dstTokens; } + + +/** + * The _Tokenizer uses objects of this type to move through the input text, + * extracting "parsed characters". These could be more than one actual character + * if the text contains escape sequences. + */ +interface CharacterCursor { + /** Initialize the cursor. */ + init(): void; + /** The parsed character at the current cursor position. */ + peek(): number; + /** Advance the cursor by one parsed character. */ + advance(): void; + /** Get a span from the marked start point to the current point. */ + getSpan(start?: this): ParseSourceSpan; + /** Get the parsed characters from the marked start point to the current point. */ + getChars(start: this): string; + /** The number of characters left before the end of the cursor. */ + charsLeft(): number; + /** The number of characters between `this` cursor and `other` cursor. */ + diff(other: this): number; + /** Make a copy of this cursor */ + clone(): CharacterCursor; +} + +interface CursorState { + peek: number; + offset: number; + line: number; + column: number; +} + +class PlainCharacterCursor implements CharacterCursor { + protected state: CursorState; + protected file: ParseSourceFile; + protected input: string; + protected end: number; + + constructor(fileOrCursor: PlainCharacterCursor); + constructor(fileOrCursor: ParseSourceFile, range: LexerRange); + constructor(fileOrCursor: ParseSourceFile|PlainCharacterCursor, range?: LexerRange) { + if (fileOrCursor instanceof PlainCharacterCursor) { + this.file = fileOrCursor.file; + this.input = fileOrCursor.input; + this.end = fileOrCursor.end; + this.state = {...fileOrCursor.state}; + } else { + if (!range) { + throw new Error( + 'Programming error: the range argument must be provided with a file argument.'); + } + this.file = fileOrCursor; + this.input = fileOrCursor.content; + this.end = range.endPos; + this.state = { + peek: -1, + offset: range.startPos, + line: range.startLine, + column: range.startCol, + }; + } + } + + clone(): PlainCharacterCursor { return new PlainCharacterCursor(this); } + + peek() { return this.state.peek; } + charsLeft() { return this.end - this.state.offset; } + diff(other: this) { return this.state.offset - other.state.offset; } + + advance(): void { this.advanceState(this.state); } + + init(): void { this.updatePeek(this.state); } + + getSpan(start?: this): ParseSourceSpan { + start = start || this; + return new ParseSourceSpan( + new ParseLocation(start.file, start.state.offset, start.state.line, start.state.column), + new ParseLocation(this.file, this.state.offset, this.state.line, this.state.column)); + } + + getChars(start: this): string { + return this.input.substring(start.state.offset, this.state.offset); + } + + charAt(pos: number): number { return this.input.charCodeAt(pos); } + + protected advanceState(state: CursorState) { + if (state.offset >= this.end) { + this.state = state; + throw new CursorError('Unexpected character "EOF"', this); + } + const currentChar = this.charAt(state.offset); + if (currentChar === chars.$LF) { + state.line++; + state.column = 0; + } else if (!chars.isNewLine(currentChar)) { + state.column++; + } + state.offset++; + this.updatePeek(state); + } + + protected updatePeek(state: CursorState): void { + state.peek = state.offset >= this.end ? chars.$EOF : this.charAt(state.offset); + } +} + +class EscapedCharacterCursor extends PlainCharacterCursor { + protected internalState: CursorState; + + constructor(fileOrCursor: EscapedCharacterCursor); + constructor(fileOrCursor: ParseSourceFile, range: LexerRange); + constructor(fileOrCursor: ParseSourceFile|EscapedCharacterCursor, range?: LexerRange) { + if (fileOrCursor instanceof EscapedCharacterCursor) { + super(fileOrCursor); + this.internalState = {...fileOrCursor.internalState}; + } else { + super(fileOrCursor, range !); + this.internalState = this.state; + } + } + + advance(): void { + this.state = this.internalState; + super.advance(); + this.processEscapeSequence(); + } + + init(): void { + super.init(); + this.processEscapeSequence(); + } + + clone(): EscapedCharacterCursor { return new EscapedCharacterCursor(this); } + + getChars(start: this): string { + const cursor = start.clone(); + let chars = ''; + while (cursor.internalState.offset < this.internalState.offset) { + chars += String.fromCodePoint(cursor.peek()); + cursor.advance(); + } + return chars; + } + + /** + * Process the escape sequence that starts at the current position in the text. + * + * This method is called to ensure that `peek` has the unescaped value of escape sequences. + */ + protected processEscapeSequence(): void { + const peek = () => this.internalState.peek; + + if (peek() === chars.$BACKSLASH) { + // We have hit an escape sequence so we need the internal state to become independent + // of the external state. + this.internalState = {...this.state}; + + // Move past the backslash + this.advanceState(this.internalState); + + // First check for standard control char sequences + if (peek() === chars.$n) { + this.state.peek = chars.$LF; + } else if (peek() === chars.$r) { + this.state.peek = chars.$CR; + } else if (peek() === chars.$v) { + this.state.peek = chars.$VTAB; + } else if (peek() === chars.$t) { + this.state.peek = chars.$TAB; + } else if (peek() === chars.$b) { + this.state.peek = chars.$BSPACE; + } else if (peek() === chars.$f) { + this.state.peek = chars.$FF; + } + + // Now consider more complex sequences + else if (peek() === chars.$u) { + // Unicode code-point sequence + this.advanceState(this.internalState); // advance past the `u` char + if (peek() === chars.$LBRACE) { + // Variable length Unicode, e.g. `\x{123}` + this.advanceState(this.internalState); // advance past the `{` char + // Advance past the variable number of hex digits until we hit a `}` char + const digitStart = this.clone(); + let length = 0; + while (peek() !== chars.$RBRACE) { + this.advanceState(this.internalState); + length++; + } + this.state.peek = this.decodeHexDigits(digitStart, length); + } else { + // Fixed length Unicode, e.g. `\u1234` + const digitStart = this.clone(); + this.advanceState(this.internalState); + this.advanceState(this.internalState); + this.advanceState(this.internalState); + this.state.peek = this.decodeHexDigits(digitStart, 4); + } + } + + else if (peek() === chars.$x) { + // Hex char code, e.g. `\x2F` + this.advanceState(this.internalState); // advance past the `x` char + const digitStart = this.clone(); + this.advanceState(this.internalState); + this.state.peek = this.decodeHexDigits(digitStart, 2); + } + + else if (chars.isOctalDigit(peek())) { + // Octal char code, e.g. `\012`, + let octal = ''; + let length = 0; + let previous = this.clone(); + while (chars.isOctalDigit(peek()) && length < 3) { + previous = this.clone(); + octal += String.fromCodePoint(peek()); + this.advanceState(this.internalState); + length++; + } + this.state.peek = parseInt(octal, 8); + // Backup one char + this.internalState = previous.internalState; + } + + else if (chars.isNewLine(this.internalState.peek)) { + // Line continuation `\` followed by a new line + this.advanceState(this.internalState); // advance over the newline + this.state = this.internalState; + } + + else { + // If none of the `if` blocks were executed then we just have an escaped normal character. + // In that case we just, effectively, skip the backslash from the character. + this.state.peek = this.internalState.peek; + } + } + } + + protected decodeHexDigits(start: EscapedCharacterCursor, length: number): number { + const hex = this.input.substr(start.internalState.offset, length); + const charCode = parseInt(hex, 16); + if (!isNaN(charCode)) { + return charCode; + } else { + start.state = start.internalState; + throw new CursorError('Invalid hexadecimal escape sequence', start); + } + } +} + +export class CursorError { + constructor(public msg: string, public cursor: CharacterCursor) {} +} diff --git a/packages/compiler/test/ml_parser/lexer_spec.ts b/packages/compiler/test/ml_parser/lexer_spec.ts index 7f527d4e65..897ff19654 100644 --- a/packages/compiler/test/ml_parser/lexer_spec.ts +++ b/packages/compiler/test/ml_parser/lexer_spec.ts @@ -456,7 +456,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u lex.TokenType.TEXT, 'Unknown entity "tbo" - use the "&#;" or "&#x;" syntax', '0:0' ]]); - expect(tokenizeAndHumanizeErrors('&#asdf;')).toEqual([ + expect(tokenizeAndHumanizeErrors('sdf;')).toEqual([ [lex.TokenType.TEXT, 'Unexpected character "s"', '0:3'] ]); expect(tokenizeAndHumanizeErrors(' sdf;')).toEqual([ @@ -891,7 +891,6 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u .toEqual([ [lex.TokenType.TEXT, '\' \" \` \\ \n \n \v \t \b \f'], [lex.TokenType.EOF], - ]); }); @@ -1033,6 +1032,68 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u ]); }); + it('should parse over escaped new line in tag definitions', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 't'], + [lex.TokenType.EOF], + ]); + }); + + it('should parse over escaped characters in tag definitions', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 't'], + [lex.TokenType.EOF], + ]); + }); + + it('should unescape characters in tag names', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 'td'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 'td'], + [lex.TokenType.EOF], + ]); + expect(tokenizeAndHumanizeSourceSpans(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, ''], + [lex.TokenType.TAG_CLOSE, ''], + [lex.TokenType.EOF, ''], + ]); + }); + + it('should unescape characters in attributes', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.ATTR_NAME, '', 'd'], + [lex.TokenType.ATTR_QUOTE, '"'], + [lex.TokenType.ATTR_VALUE, 'e'], + [lex.TokenType.ATTR_QUOTE, '"'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 't'], + [lex.TokenType.EOF], + ]); + }); + + it('should parse over escaped new line in attribute values', () => { + const text = ''; + expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ + [lex.TokenType.TAG_OPEN_START, '', 't'], + [lex.TokenType.ATTR_NAME, '', 'a'], + [lex.TokenType.ATTR_VALUE, 'b'], + [lex.TokenType.TAG_OPEN_END], + [lex.TokenType.TAG_CLOSE, '', 't'], + [lex.TokenType.EOF], + ]); + }); + it('should tokenize the correct span when there are escape sequences', () => { const text = 'selector: "app-root",\ntemplate: "line 1\\n\\"line 2\\"\\nline 3",\ninputs: []'; @@ -1057,7 +1118,6 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u 'line 2\\n' + // <- escaped line break 'line 3\\\n' + // <- line continuation ''; - expect(tokenizeAndHumanizeParts(text, {escapedString: true})).toEqual([ [lex.TokenType.TAG_OPEN_START, '', 't'], [lex.TokenType.TAG_OPEN_END], [lex.TokenType.TEXT, 'line 1'], [lex.TokenType.TAG_CLOSE, '', 't'], @@ -1084,7 +1144,7 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TAG_OPEN_END, '1:2'], [lex.TokenType.TEXT, '1:3'], [lex.TokenType.TAG_CLOSE, '1:9'], - [lex.TokenType.TEXT, '1:14'], // <- escaped newline does not increment the row + [lex.TokenType.TEXT, '1:13'], // <- escaped newline does not increment the row [lex.TokenType.TAG_OPEN_START, '1:15'], [lex.TokenType.TAG_OPEN_END, '1:17'], @@ -1099,8 +1159,8 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u [lex.TokenType.TEXT, '\n'], [lex.TokenType.TAG_OPEN_START, ''], - [lex.TokenType.TEXT, 'line 2'], [lex.TokenType.TAG_CLOSE, '\\'], - [lex.TokenType.TEXT, 'n'], + [lex.TokenType.TEXT, 'line 2'], [lex.TokenType.TAG_CLOSE, ''], + [lex.TokenType.TEXT, '\\n'], [lex.TokenType.TAG_OPEN_START, ''], [lex.TokenType.TEXT, 'line 3\\\n'], [lex.TokenType.TAG_CLOSE, ''],