feat(compiler): record end of expression Token (#33549)

In the past, only the starting index of an expression Token was
recorded, so a parser could demarcate the span of a token only by the
start locations of two consecutive tokens. This can lead to trailing
whitespace being included in the token span:

```html
{{ token1   + token2 }}
   ^^^^^^^^^             recorded span of `token1`
```

It's also not enough for a parser to determine the end of a token by
adding the length of the token value to the token's start location,
because lexed expression values may not exactly reflect the source code.
For example, `"d\\"e"` is lexed as a string token whose value is `d"e`.
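
A minimal, self-contained illustration of the mismatch (not code from
this commit; the variable names are only for demonstration):

```ts
// The 6-character source text of the string token from the example above.
const sourceText = '"d\\"e"';    // i.e. "d\"e" as written in the expression
// The value the lexer produces after processing the escape sequence.
const lexedValue = 'd"e';

console.log(sourceText.length);  // 6 -- the token's true width in the source
console.log(lexedValue.length);  // 3 -- start + value length undershoots the end
```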

Instead, this commit adds an `end` field to expression tokens. `end`
is one past the last index of the token in the source code. This
enables a parser to determine the span of a token just by looking at
that token.
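
For instance, a consumer of the lexer could then recover a token's
exact source text with a single slice (a sketch, assuming the half-open
`[index, end)` convention described above):

```ts
// Hypothetical helper: given the expression source and a token carrying
// `index` and `end`, return exactly the characters the token covers.
function tokenSpan(source: string, token: {index: number, end: number}): string {
  return source.substring(token.index, token.end);
}

// For 'token1   + token2', the span of `token1` no longer drags in the
// whitespace that precedes `+`.
```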

This is a breaking change because the constructor interface of `Token`
has changed.
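
Any code that constructs `Token` directly now has to thread the end
index through as the second constructor argument; a sketch (the deep
import path is an assumption, based on the file changed below):

```ts
// Assumed import path for the compiler's expression lexer.
import {Token, TokenType} from '@angular/compiler/src/expression_parser/lexer';

// Before: new Token(0, TokenType.Character, 46, '.')
// After:  `end` (one past the last character) is the second argument.
const dot = new Token(0, 1, TokenType.Character, 46 /* '.' */, '.');
```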

Part of #33477.

PR Close #33549
ayazhafiz 2019-11-02 14:15:49 -05:00 committed by Alex Rickabaugh
parent 8410278b6d
commit 19944c2424
2 changed files with 118 additions and 113 deletions

View File

@@ -35,7 +35,7 @@ export class Lexer {
export class Token {
constructor(
public index: number, public type: TokenType, public numValue: number,
public index: number, public end: number, public type: TokenType, public numValue: number,
public strValue: string) {}
isCharacter(code: number): boolean {
@@ -91,35 +91,35 @@ export class Token {
}
}
function newCharacterToken(index: number, code: number): Token {
return new Token(index, TokenType.Character, code, String.fromCharCode(code));
function newCharacterToken(index: number, end: number, code: number): Token {
return new Token(index, end, TokenType.Character, code, String.fromCharCode(code));
}
function newIdentifierToken(index: number, text: string): Token {
return new Token(index, TokenType.Identifier, 0, text);
function newIdentifierToken(index: number, end: number, text: string): Token {
return new Token(index, end, TokenType.Identifier, 0, text);
}
function newKeywordToken(index: number, text: string): Token {
return new Token(index, TokenType.Keyword, 0, text);
function newKeywordToken(index: number, end: number, text: string): Token {
return new Token(index, end, TokenType.Keyword, 0, text);
}
function newOperatorToken(index: number, text: string): Token {
return new Token(index, TokenType.Operator, 0, text);
function newOperatorToken(index: number, end: number, text: string): Token {
return new Token(index, end, TokenType.Operator, 0, text);
}
function newStringToken(index: number, text: string): Token {
return new Token(index, TokenType.String, 0, text);
function newStringToken(index: number, end: number, text: string): Token {
return new Token(index, end, TokenType.String, 0, text);
}
function newNumberToken(index: number, n: number): Token {
return new Token(index, TokenType.Number, n, '');
function newNumberToken(index: number, end: number, n: number): Token {
return new Token(index, end, TokenType.Number, n, '');
}
function newErrorToken(index: number, message: string): Token {
return new Token(index, TokenType.Error, 0, message);
function newErrorToken(index: number, end: number, message: string): Token {
return new Token(index, end, TokenType.Error, 0, message);
}
export const EOF: Token = new Token(-1, TokenType.Character, 0, '');
export const EOF: Token = new Token(-1, -1, TokenType.Character, 0, '');
class _Scanner {
length: number;
@@ -165,7 +165,7 @@ class _Scanner {
case chars.$PERIOD:
this.advance();
return chars.isDigit(this.peek) ? this.scanNumber(start) :
newCharacterToken(start, chars.$PERIOD);
newCharacterToken(start, this.index, chars.$PERIOD);
case chars.$LPAREN:
case chars.$RPAREN:
case chars.$LBRACE:
@@ -211,13 +211,13 @@
scanCharacter(start: number, code: number): Token {
this.advance();
return newCharacterToken(start, code);
return newCharacterToken(start, this.index, code);
}
scanOperator(start: number, str: string): Token {
this.advance();
return newOperatorToken(start, str);
return newOperatorToken(start, this.index, str);
}
/**
@@ -243,7 +243,7 @@ class _Scanner {
this.advance();
str += three;
}
return newOperatorToken(start, str);
return newOperatorToken(start, this.index, str);
}
scanIdentifier(): Token {
@@ -251,8 +251,8 @@ class _Scanner {
this.advance();
while (isIdentifierPart(this.peek)) this.advance();
const str: string = this.input.substring(start, this.index);
return KEYWORDS.indexOf(str) > -1 ? newKeywordToken(start, str) :
newIdentifierToken(start, str);
return KEYWORDS.indexOf(str) > -1 ? newKeywordToken(start, this.index, str) :
newIdentifierToken(start, this.index, str);
}
scanNumber(start: number): Token {
@@ -275,7 +275,7 @@ class _Scanner {
}
const str: string = this.input.substring(start, this.index);
const value: number = simple ? parseIntAutoRadix(str) : parseFloat(str);
return newNumberToken(start, value);
return newNumberToken(start, this.index, value);
}
scanString(): Token {
@@ -321,13 +321,14 @@ class _Scanner {
const last: string = input.substring(marker, this.index);
this.advance(); // Skip terminating quote.
return newStringToken(start, buffer + last);
return newStringToken(start, this.index, buffer + last);
}
error(message: string, offset: number): Token {
const position: number = this.index + offset;
return newErrorToken(
position, `Lexer Error: ${message} at column ${position} in expression [${this.input}]`);
position, this.index,
`Lexer Error: ${message} at column ${position} in expression [${this.input}]`);
}
}

View File

@@ -12,48 +12,49 @@ function lex(text: string): any[] {
return new Lexer().tokenize(text);
}
function expectToken(token: any, index: number) {
function expectToken(token: any, index: number, end: number) {
expect(token instanceof Token).toBe(true);
expect(token.index).toEqual(index);
expect(token.end).toEqual(end);
}
function expectCharacterToken(token: any, index: number, character: string) {
function expectCharacterToken(token: any, index: number, end: number, character: string) {
expect(character.length).toBe(1);
expectToken(token, index);
expectToken(token, index, end);
expect(token.isCharacter(character.charCodeAt(0))).toBe(true);
}
function expectOperatorToken(token: any, index: number, operator: string) {
expectToken(token, index);
function expectOperatorToken(token: any, index: number, end: number, operator: string) {
expectToken(token, index, end);
expect(token.isOperator(operator)).toBe(true);
}
function expectNumberToken(token: any, index: number, n: number) {
expectToken(token, index);
function expectNumberToken(token: any, index: number, end: number, n: number) {
expectToken(token, index, end);
expect(token.isNumber()).toBe(true);
expect(token.toNumber()).toEqual(n);
}
function expectStringToken(token: any, index: number, str: string) {
expectToken(token, index);
function expectStringToken(token: any, index: number, end: number, str: string) {
expectToken(token, index, end);
expect(token.isString()).toBe(true);
expect(token.toString()).toEqual(str);
}
function expectIdentifierToken(token: any, index: number, identifier: string) {
expectToken(token, index);
function expectIdentifierToken(token: any, index: number, end: number, identifier: string) {
expectToken(token, index, end);
expect(token.isIdentifier()).toBe(true);
expect(token.toString()).toEqual(identifier);
}
function expectKeywordToken(token: any, index: number, keyword: string) {
expectToken(token, index);
function expectKeywordToken(token: any, index: number, end: number, keyword: string) {
expectToken(token, index, end);
expect(token.isKeyword()).toBe(true);
expect(token.toString()).toEqual(keyword);
}
function expectErrorToken(token: Token, index: any, message: string) {
expectToken(token, index);
function expectErrorToken(token: Token, index: any, end: number, message: string) {
expectToken(token, index, end);
expect(token.isError()).toBe(true);
expect(token.toString()).toEqual(message);
}
@@ -64,88 +65,88 @@ function expectErrorToken(token: Token, index: any, message: string) {
it('should tokenize a simple identifier', () => {
const tokens: number[] = lex('j');
expect(tokens.length).toEqual(1);
expectIdentifierToken(tokens[0], 0, 'j');
expectIdentifierToken(tokens[0], 0, 1, 'j');
});
it('should tokenize "this"', () => {
const tokens: number[] = lex('this');
expect(tokens.length).toEqual(1);
expectKeywordToken(tokens[0], 0, 'this');
expectKeywordToken(tokens[0], 0, 4, 'this');
});
it('should tokenize a dotted identifier', () => {
const tokens: number[] = lex('j.k');
expect(tokens.length).toEqual(3);
expectIdentifierToken(tokens[0], 0, 'j');
expectCharacterToken(tokens[1], 1, '.');
expectIdentifierToken(tokens[2], 2, 'k');
expectIdentifierToken(tokens[0], 0, 1, 'j');
expectCharacterToken(tokens[1], 1, 2, '.');
expectIdentifierToken(tokens[2], 2, 3, 'k');
});
it('should tokenize an operator', () => {
const tokens: number[] = lex('j-k');
expect(tokens.length).toEqual(3);
expectOperatorToken(tokens[1], 1, '-');
expectOperatorToken(tokens[1], 1, 2, '-');
});
it('should tokenize an indexed operator', () => {
const tokens: number[] = lex('j[k]');
expect(tokens.length).toEqual(4);
expectCharacterToken(tokens[1], 1, '[');
expectCharacterToken(tokens[3], 3, ']');
expectCharacterToken(tokens[1], 1, 2, '[');
expectCharacterToken(tokens[3], 3, 4, ']');
});
it('should tokenize numbers', () => {
const tokens: number[] = lex('88');
expect(tokens.length).toEqual(1);
expectNumberToken(tokens[0], 0, 88);
expectNumberToken(tokens[0], 0, 2, 88);
});
it('should tokenize numbers within index ops',
() => { expectNumberToken(lex('a[22]')[2], 2, 22); });
() => { expectNumberToken(lex('a[22]')[2], 2, 4, 22); });
it('should tokenize simple quoted strings',
() => { expectStringToken(lex('"a"')[0], 0, 'a'); });
() => { expectStringToken(lex('"a"')[0], 0, 3, 'a'); });
it('should tokenize quoted strings with escaped quotes',
() => { expectStringToken(lex('"a\\""')[0], 0, 'a"'); });
() => { expectStringToken(lex('"a\\""')[0], 0, 5, 'a"'); });
it('should tokenize a string', () => {
const tokens: Token[] = lex('j-a.bc[22]+1.3|f:\'a\\\'c\':"d\\"e"');
expectIdentifierToken(tokens[0], 0, 'j');
expectOperatorToken(tokens[1], 1, '-');
expectIdentifierToken(tokens[2], 2, 'a');
expectCharacterToken(tokens[3], 3, '.');
expectIdentifierToken(tokens[4], 4, 'bc');
expectCharacterToken(tokens[5], 6, '[');
expectNumberToken(tokens[6], 7, 22);
expectCharacterToken(tokens[7], 9, ']');
expectOperatorToken(tokens[8], 10, '+');
expectNumberToken(tokens[9], 11, 1.3);
expectOperatorToken(tokens[10], 14, '|');
expectIdentifierToken(tokens[11], 15, 'f');
expectCharacterToken(tokens[12], 16, ':');
expectStringToken(tokens[13], 17, 'a\'c');
expectCharacterToken(tokens[14], 23, ':');
expectStringToken(tokens[15], 24, 'd"e');
expectIdentifierToken(tokens[0], 0, 1, 'j');
expectOperatorToken(tokens[1], 1, 2, '-');
expectIdentifierToken(tokens[2], 2, 3, 'a');
expectCharacterToken(tokens[3], 3, 4, '.');
expectIdentifierToken(tokens[4], 4, 6, 'bc');
expectCharacterToken(tokens[5], 6, 7, '[');
expectNumberToken(tokens[6], 7, 9, 22);
expectCharacterToken(tokens[7], 9, 10, ']');
expectOperatorToken(tokens[8], 10, 11, '+');
expectNumberToken(tokens[9], 11, 14, 1.3);
expectOperatorToken(tokens[10], 14, 15, '|');
expectIdentifierToken(tokens[11], 15, 16, 'f');
expectCharacterToken(tokens[12], 16, 17, ':');
expectStringToken(tokens[13], 17, 23, 'a\'c');
expectCharacterToken(tokens[14], 23, 24, ':');
expectStringToken(tokens[15], 24, 30, 'd"e');
});
it('should tokenize undefined', () => {
const tokens: Token[] = lex('undefined');
expectKeywordToken(tokens[0], 0, 'undefined');
expectKeywordToken(tokens[0], 0, 9, 'undefined');
expect(tokens[0].isKeywordUndefined()).toBe(true);
});
it('should ignore whitespace', () => {
const tokens: Token[] = lex('a \t \n \r b');
expectIdentifierToken(tokens[0], 0, 'a');
expectIdentifierToken(tokens[1], 8, 'b');
expectIdentifierToken(tokens[0], 0, 1, 'a');
expectIdentifierToken(tokens[1], 8, 9, 'b');
});
it('should tokenize quoted string', () => {
const str = '[\'\\\'\', "\\""]';
const tokens: Token[] = lex(str);
expectStringToken(tokens[1], 1, '\'');
expectStringToken(tokens[3], 7, '"');
expectStringToken(tokens[1], 1, 5, '\'');
expectStringToken(tokens[3], 7, 11, '"');
});
it('should tokenize escaped quoted string', () => {
@@ -163,86 +164,89 @@ function expectErrorToken(token: Token, index: any, message: string) {
it('should tokenize relation', () => {
const tokens: Token[] = lex('! == != < > <= >= === !==');
expectOperatorToken(tokens[0], 0, '!');
expectOperatorToken(tokens[1], 2, '==');
expectOperatorToken(tokens[2], 5, '!=');
expectOperatorToken(tokens[3], 8, '<');
expectOperatorToken(tokens[4], 10, '>');
expectOperatorToken(tokens[5], 12, '<=');
expectOperatorToken(tokens[6], 15, '>=');
expectOperatorToken(tokens[7], 18, '===');
expectOperatorToken(tokens[8], 22, '!==');
expectOperatorToken(tokens[0], 0, 1, '!');
expectOperatorToken(tokens[1], 2, 4, '==');
expectOperatorToken(tokens[2], 5, 7, '!=');
expectOperatorToken(tokens[3], 8, 9, '<');
expectOperatorToken(tokens[4], 10, 11, '>');
expectOperatorToken(tokens[5], 12, 14, '<=');
expectOperatorToken(tokens[6], 15, 17, '>=');
expectOperatorToken(tokens[7], 18, 21, '===');
expectOperatorToken(tokens[8], 22, 25, '!==');
});
it('should tokenize statements', () => {
const tokens: Token[] = lex('a;b;');
expectIdentifierToken(tokens[0], 0, 'a');
expectCharacterToken(tokens[1], 1, ';');
expectIdentifierToken(tokens[2], 2, 'b');
expectCharacterToken(tokens[3], 3, ';');
expectIdentifierToken(tokens[0], 0, 1, 'a');
expectCharacterToken(tokens[1], 1, 2, ';');
expectIdentifierToken(tokens[2], 2, 3, 'b');
expectCharacterToken(tokens[3], 3, 4, ';');
});
it('should tokenize function invocation', () => {
const tokens: Token[] = lex('a()');
expectIdentifierToken(tokens[0], 0, 'a');
expectCharacterToken(tokens[1], 1, '(');
expectCharacterToken(tokens[2], 2, ')');
expectIdentifierToken(tokens[0], 0, 1, 'a');
expectCharacterToken(tokens[1], 1, 2, '(');
expectCharacterToken(tokens[2], 2, 3, ')');
});
it('should tokenize simple method invocations', () => {
const tokens: Token[] = lex('a.method()');
expectIdentifierToken(tokens[2], 2, 'method');
expectIdentifierToken(tokens[2], 2, 8, 'method');
});
it('should tokenize method invocation', () => {
const tokens: Token[] = lex('a.b.c (d) - e.f()');
expectIdentifierToken(tokens[0], 0, 'a');
expectCharacterToken(tokens[1], 1, '.');
expectIdentifierToken(tokens[2], 2, 'b');
expectCharacterToken(tokens[3], 3, '.');
expectIdentifierToken(tokens[4], 4, 'c');
expectCharacterToken(tokens[5], 6, '(');
expectIdentifierToken(tokens[6], 7, 'd');
expectCharacterToken(tokens[7], 8, ')');
expectOperatorToken(tokens[8], 10, '-');
expectIdentifierToken(tokens[9], 12, 'e');
expectCharacterToken(tokens[10], 13, '.');
expectIdentifierToken(tokens[11], 14, 'f');
expectCharacterToken(tokens[12], 15, '(');
expectCharacterToken(tokens[13], 16, ')');
expectIdentifierToken(tokens[0], 0, 1, 'a');
expectCharacterToken(tokens[1], 1, 2, '.');
expectIdentifierToken(tokens[2], 2, 3, 'b');
expectCharacterToken(tokens[3], 3, 4, '.');
expectIdentifierToken(tokens[4], 4, 5, 'c');
expectCharacterToken(tokens[5], 6, 7, '(');
expectIdentifierToken(tokens[6], 7, 8, 'd');
expectCharacterToken(tokens[7], 8, 9, ')');
expectOperatorToken(tokens[8], 10, 11, '-');
expectIdentifierToken(tokens[9], 12, 13, 'e');
expectCharacterToken(tokens[10], 13, 14, '.');
expectIdentifierToken(tokens[11], 14, 15, 'f');
expectCharacterToken(tokens[12], 15, 16, '(');
expectCharacterToken(tokens[13], 16, 17, ')');
});
it('should tokenize number', () => { expectNumberToken(lex('0.5')[0], 0, 0.5); });
it('should tokenize number', () => { expectNumberToken(lex('0.5')[0], 0, 3, 0.5); });
it('should tokenize number with exponent', () => {
let tokens: Token[] = lex('0.5E-10');
expect(tokens.length).toEqual(1);
expectNumberToken(tokens[0], 0, 0.5E-10);
expectNumberToken(tokens[0], 0, 7, 0.5E-10);
tokens = lex('0.5E+10');
expectNumberToken(tokens[0], 0, 0.5E+10);
expectNumberToken(tokens[0], 0, 7, 0.5E+10);
});
it('should return exception for invalid exponent', () => {
expectErrorToken(
lex('0.5E-')[0], 4, 'Lexer Error: Invalid exponent at column 4 in expression [0.5E-]');
lex('0.5E-')[0], 4, 5,
'Lexer Error: Invalid exponent at column 4 in expression [0.5E-]');
expectErrorToken(
lex('0.5E-A')[0], 4,
lex('0.5E-A')[0], 4, 5,
'Lexer Error: Invalid exponent at column 4 in expression [0.5E-A]');
});
it('should tokenize number starting with a dot',
() => { expectNumberToken(lex('.5')[0], 0, 0.5); });
() => { expectNumberToken(lex('.5')[0], 0, 2, 0.5); });
it('should throw error on invalid unicode', () => {
expectErrorToken(
lex('\'\\u1\'\'bla\'')[0], 2,
lex('\'\\u1\'\'bla\'')[0], 2, 2,
'Lexer Error: Invalid unicode escape [\\u1\'\'b] at column 2 in expression [\'\\u1\'\'bla\']');
});
it('should tokenize hash as operator', () => { expectOperatorToken(lex('#')[0], 0, '#'); });
it('should tokenize hash as operator',
() => { expectOperatorToken(lex('#')[0], 0, 1, '#'); });
it('should tokenize ?. as operator', () => { expectOperatorToken(lex('?.')[0], 0, '?.'); });
it('should tokenize ?. as operator',
() => { expectOperatorToken(lex('?.')[0], 0, 2, '?.'); });
});
});
}