feat(compiler): record end of expression Token (#33549)

Previously, only the starting index of an expression Token was recorded, so a
parser could demarcate the span of a token only by the start locations of two
consecutive tokens. This can lead to trailing whitespace being included in the
token span:

```html
{{ token1   + token2 }}
   ^^^^^^^^^             recorded span of `token1`
```

It's also not enough for a parser to determine the end of a token by
adding the length of the token value to the token's start location,
because lexed expression values may not exactly reflect the source code.
For example, `"d\\"e"` is lexed as a string token whose value is `d"e`.
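
As a rough illustration (the snippet below uses hypothetical values and is not code from this patch), the lexed value of such a string token is shorter than the source text it was scanned from, so `start + value.length` undershoots the real end:

```ts
// Hypothetical illustration: a string literal whose lexed value is shorter
// than the source text it was scanned from.
const sourceText = '"d\\"e"';  // the 6 characters of expression source
const lexedValue = 'd"e';      // the 3-character value after escape processing

console.log(sourceText.length);  // 6 -- the true width of the token
console.log(lexedValue.length);  // 3 -- what `start + value.length` would use
```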

Instead, this commit adds an `end` field to expression tokens. `end` is one
past the last index of the token's source code. This enables a parser to
determine the span of a token just by looking at that token.
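
For example (a minimal sketch with hypothetical index/end values, not code from this commit), a consumer that still holds the expression text can recover a token's exact source span with a single `substring` call:

```ts
// Minimal sketch: `end` is exclusive (one past the last character), so
// String.prototype.substring slices the token's exact source text.
function tokenSource(expression: string, token: {index: number, end: number}): string {
  return expression.substring(token.index, token.end);
}

const expression = 'token1   + token2';
// Hypothetical token for `token1`: index 0, end 6.
console.log(tokenSource(expression, {index: 0, end: 6}));  // "token1" -- no trailing whitespace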

This is a breaking change because the constructor interface of `Token`
has changed.
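
For illustration (a simplified stand-in for the compiler's `Token`, not the real module; only the constructor order, which matches the diff below, is meaningful), call sites must now pass `end` as the second argument:

```ts
// Simplified stand-in mirroring the constructor order in the diff below;
// the real Token/TokenType live in the compiler's expression lexer.
enum TokenType { Character, Identifier, Keyword, String, Operator, Number, Error }

class Token {
  constructor(
      public index: number, public end: number, public type: TokenType,
      public numValue: number, public strValue: string) {}
}

// Before: new Token(0, TokenType.Identifier, 0, 'token1');
// After: `end` is the second constructor argument.
const token = new Token(0, 6, TokenType.Identifier, 0, 'token1');
console.log(token.end);  // 6
```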

Part of #33477.

PR Close #33549
Authored by ayazhafiz on 2019-11-02 14:15:49 -05:00, committed by Alex Rickabaugh
parent 8410278b6d
commit 19944c2424

2 changed files with 118 additions and 113 deletions


@@ -35,7 +35,7 @@ export class Lexer {
 export class Token {
   constructor(
-      public index: number, public type: TokenType, public numValue: number,
+      public index: number, public end: number, public type: TokenType, public numValue: number,
       public strValue: string) {}

   isCharacter(code: number): boolean {
@@ -91,35 +91,35 @@ export class Token {
   }
 }

-function newCharacterToken(index: number, code: number): Token {
-  return new Token(index, TokenType.Character, code, String.fromCharCode(code));
+function newCharacterToken(index: number, end: number, code: number): Token {
+  return new Token(index, end, TokenType.Character, code, String.fromCharCode(code));
 }

-function newIdentifierToken(index: number, text: string): Token {
-  return new Token(index, TokenType.Identifier, 0, text);
+function newIdentifierToken(index: number, end: number, text: string): Token {
+  return new Token(index, end, TokenType.Identifier, 0, text);
 }

-function newKeywordToken(index: number, text: string): Token {
-  return new Token(index, TokenType.Keyword, 0, text);
+function newKeywordToken(index: number, end: number, text: string): Token {
+  return new Token(index, end, TokenType.Keyword, 0, text);
 }

-function newOperatorToken(index: number, text: string): Token {
-  return new Token(index, TokenType.Operator, 0, text);
+function newOperatorToken(index: number, end: number, text: string): Token {
+  return new Token(index, end, TokenType.Operator, 0, text);
 }

-function newStringToken(index: number, text: string): Token {
-  return new Token(index, TokenType.String, 0, text);
+function newStringToken(index: number, end: number, text: string): Token {
+  return new Token(index, end, TokenType.String, 0, text);
 }

-function newNumberToken(index: number, n: number): Token {
-  return new Token(index, TokenType.Number, n, '');
+function newNumberToken(index: number, end: number, n: number): Token {
+  return new Token(index, end, TokenType.Number, n, '');
 }

-function newErrorToken(index: number, message: string): Token {
-  return new Token(index, TokenType.Error, 0, message);
+function newErrorToken(index: number, end: number, message: string): Token {
+  return new Token(index, end, TokenType.Error, 0, message);
 }

-export const EOF: Token = new Token(-1, TokenType.Character, 0, '');
+export const EOF: Token = new Token(-1, -1, TokenType.Character, 0, '');

 class _Scanner {
   length: number;
@@ -165,7 +165,7 @@ class _Scanner {
       case chars.$PERIOD:
         this.advance();
         return chars.isDigit(this.peek) ? this.scanNumber(start) :
-                                          newCharacterToken(start, chars.$PERIOD);
+                                          newCharacterToken(start, this.index, chars.$PERIOD);
       case chars.$LPAREN:
       case chars.$RPAREN:
       case chars.$LBRACE:
@@ -211,13 +211,13 @@ class _Scanner {

   scanCharacter(start: number, code: number): Token {
     this.advance();
-    return newCharacterToken(start, code);
+    return newCharacterToken(start, this.index, code);
   }

   scanOperator(start: number, str: string): Token {
     this.advance();
-    return newOperatorToken(start, str);
+    return newOperatorToken(start, this.index, str);
   }

   /**
@@ -243,7 +243,7 @@ class _Scanner {
       this.advance();
       str += three;
     }
-    return newOperatorToken(start, str);
+    return newOperatorToken(start, this.index, str);
   }

   scanIdentifier(): Token {
@@ -251,8 +251,8 @@ class _Scanner {
     this.advance();
     while (isIdentifierPart(this.peek)) this.advance();
     const str: string = this.input.substring(start, this.index);
-    return KEYWORDS.indexOf(str) > -1 ? newKeywordToken(start, str) :
-                                        newIdentifierToken(start, str);
+    return KEYWORDS.indexOf(str) > -1 ? newKeywordToken(start, this.index, str) :
+                                        newIdentifierToken(start, this.index, str);
   }

   scanNumber(start: number): Token {
@@ -275,7 +275,7 @@ class _Scanner {
     }
     const str: string = this.input.substring(start, this.index);
     const value: number = simple ? parseIntAutoRadix(str) : parseFloat(str);
-    return newNumberToken(start, value);
+    return newNumberToken(start, this.index, value);
   }

   scanString(): Token {
@@ -321,13 +321,14 @@ class _Scanner {
     const last: string = input.substring(marker, this.index);
     this.advance();  // Skip terminating quote.

-    return newStringToken(start, buffer + last);
+    return newStringToken(start, this.index, buffer + last);
   }

   error(message: string, offset: number): Token {
     const position: number = this.index + offset;
     return newErrorToken(
-        position, `Lexer Error: ${message} at column ${position} in expression [${this.input}]`);
+        position, this.index,
+        `Lexer Error: ${message} at column ${position} in expression [${this.input}]`);
   }
 }


@@ -12,48 +12,49 @@ function lex(text: string): any[] {
   return new Lexer().tokenize(text);
 }

-function expectToken(token: any, index: number) {
+function expectToken(token: any, index: number, end: number) {
   expect(token instanceof Token).toBe(true);
   expect(token.index).toEqual(index);
+  expect(token.end).toEqual(end);
 }

-function expectCharacterToken(token: any, index: number, character: string) {
+function expectCharacterToken(token: any, index: number, end: number, character: string) {
   expect(character.length).toBe(1);
-  expectToken(token, index);
+  expectToken(token, index, end);
   expect(token.isCharacter(character.charCodeAt(0))).toBe(true);
 }

-function expectOperatorToken(token: any, index: number, operator: string) {
-  expectToken(token, index);
+function expectOperatorToken(token: any, index: number, end: number, operator: string) {
+  expectToken(token, index, end);
   expect(token.isOperator(operator)).toBe(true);
 }

-function expectNumberToken(token: any, index: number, n: number) {
-  expectToken(token, index);
+function expectNumberToken(token: any, index: number, end: number, n: number) {
+  expectToken(token, index, end);
   expect(token.isNumber()).toBe(true);
   expect(token.toNumber()).toEqual(n);
 }

-function expectStringToken(token: any, index: number, str: string) {
-  expectToken(token, index);
+function expectStringToken(token: any, index: number, end: number, str: string) {
+  expectToken(token, index, end);
   expect(token.isString()).toBe(true);
   expect(token.toString()).toEqual(str);
 }

-function expectIdentifierToken(token: any, index: number, identifier: string) {
-  expectToken(token, index);
+function expectIdentifierToken(token: any, index: number, end: number, identifier: string) {
+  expectToken(token, index, end);
   expect(token.isIdentifier()).toBe(true);
   expect(token.toString()).toEqual(identifier);
 }

-function expectKeywordToken(token: any, index: number, keyword: string) {
-  expectToken(token, index);
+function expectKeywordToken(token: any, index: number, end: number, keyword: string) {
+  expectToken(token, index, end);
   expect(token.isKeyword()).toBe(true);
   expect(token.toString()).toEqual(keyword);
 }

-function expectErrorToken(token: Token, index: any, message: string) {
-  expectToken(token, index);
+function expectErrorToken(token: Token, index: any, end: number, message: string) {
+  expectToken(token, index, end);
   expect(token.isError()).toBe(true);
   expect(token.toString()).toEqual(message);
 }
@@ -64,88 +65,88 @@ function expectErrorToken(token: Token, index: any, message: string) {
     it('should tokenize a simple identifier', () => {
       const tokens: number[] = lex('j');
       expect(tokens.length).toEqual(1);
-      expectIdentifierToken(tokens[0], 0, 'j');
+      expectIdentifierToken(tokens[0], 0, 1, 'j');
     });

     it('should tokenize "this"', () => {
       const tokens: number[] = lex('this');
       expect(tokens.length).toEqual(1);
-      expectKeywordToken(tokens[0], 0, 'this');
+      expectKeywordToken(tokens[0], 0, 4, 'this');
     });

     it('should tokenize a dotted identifier', () => {
       const tokens: number[] = lex('j.k');
       expect(tokens.length).toEqual(3);
-      expectIdentifierToken(tokens[0], 0, 'j');
-      expectCharacterToken(tokens[1], 1, '.');
-      expectIdentifierToken(tokens[2], 2, 'k');
+      expectIdentifierToken(tokens[0], 0, 1, 'j');
+      expectCharacterToken(tokens[1], 1, 2, '.');
+      expectIdentifierToken(tokens[2], 2, 3, 'k');
     });

     it('should tokenize an operator', () => {
       const tokens: number[] = lex('j-k');
       expect(tokens.length).toEqual(3);
-      expectOperatorToken(tokens[1], 1, '-');
+      expectOperatorToken(tokens[1], 1, 2, '-');
     });

     it('should tokenize an indexed operator', () => {
       const tokens: number[] = lex('j[k]');
       expect(tokens.length).toEqual(4);
-      expectCharacterToken(tokens[1], 1, '[');
-      expectCharacterToken(tokens[3], 3, ']');
+      expectCharacterToken(tokens[1], 1, 2, '[');
+      expectCharacterToken(tokens[3], 3, 4, ']');
     });

     it('should tokenize numbers', () => {
       const tokens: number[] = lex('88');
       expect(tokens.length).toEqual(1);
-      expectNumberToken(tokens[0], 0, 88);
+      expectNumberToken(tokens[0], 0, 2, 88);
     });

     it('should tokenize numbers within index ops',
-       () => { expectNumberToken(lex('a[22]')[2], 2, 22); });
+       () => { expectNumberToken(lex('a[22]')[2], 2, 4, 22); });

     it('should tokenize simple quoted strings',
-       () => { expectStringToken(lex('"a"')[0], 0, 'a'); });
+       () => { expectStringToken(lex('"a"')[0], 0, 3, 'a'); });

     it('should tokenize quoted strings with escaped quotes',
-       () => { expectStringToken(lex('"a\\""')[0], 0, 'a"'); });
+       () => { expectStringToken(lex('"a\\""')[0], 0, 5, 'a"'); });

     it('should tokenize a string', () => {
       const tokens: Token[] = lex('j-a.bc[22]+1.3|f:\'a\\\'c\':"d\\"e"');
-      expectIdentifierToken(tokens[0], 0, 'j');
-      expectOperatorToken(tokens[1], 1, '-');
-      expectIdentifierToken(tokens[2], 2, 'a');
-      expectCharacterToken(tokens[3], 3, '.');
-      expectIdentifierToken(tokens[4], 4, 'bc');
-      expectCharacterToken(tokens[5], 6, '[');
-      expectNumberToken(tokens[6], 7, 22);
-      expectCharacterToken(tokens[7], 9, ']');
-      expectOperatorToken(tokens[8], 10, '+');
-      expectNumberToken(tokens[9], 11, 1.3);
-      expectOperatorToken(tokens[10], 14, '|');
-      expectIdentifierToken(tokens[11], 15, 'f');
-      expectCharacterToken(tokens[12], 16, ':');
-      expectStringToken(tokens[13], 17, 'a\'c');
-      expectCharacterToken(tokens[14], 23, ':');
-      expectStringToken(tokens[15], 24, 'd"e');
+      expectIdentifierToken(tokens[0], 0, 1, 'j');
+      expectOperatorToken(tokens[1], 1, 2, '-');
+      expectIdentifierToken(tokens[2], 2, 3, 'a');
+      expectCharacterToken(tokens[3], 3, 4, '.');
+      expectIdentifierToken(tokens[4], 4, 6, 'bc');
+      expectCharacterToken(tokens[5], 6, 7, '[');
+      expectNumberToken(tokens[6], 7, 9, 22);
+      expectCharacterToken(tokens[7], 9, 10, ']');
+      expectOperatorToken(tokens[8], 10, 11, '+');
+      expectNumberToken(tokens[9], 11, 14, 1.3);
+      expectOperatorToken(tokens[10], 14, 15, '|');
+      expectIdentifierToken(tokens[11], 15, 16, 'f');
+      expectCharacterToken(tokens[12], 16, 17, ':');
+      expectStringToken(tokens[13], 17, 23, 'a\'c');
+      expectCharacterToken(tokens[14], 23, 24, ':');
+      expectStringToken(tokens[15], 24, 30, 'd"e');
     });

     it('should tokenize undefined', () => {
       const tokens: Token[] = lex('undefined');
-      expectKeywordToken(tokens[0], 0, 'undefined');
+      expectKeywordToken(tokens[0], 0, 9, 'undefined');
       expect(tokens[0].isKeywordUndefined()).toBe(true);
     });

     it('should ignore whitespace', () => {
       const tokens: Token[] = lex('a \t \n \r b');
-      expectIdentifierToken(tokens[0], 0, 'a');
-      expectIdentifierToken(tokens[1], 8, 'b');
+      expectIdentifierToken(tokens[0], 0, 1, 'a');
+      expectIdentifierToken(tokens[1], 8, 9, 'b');
     });

     it('should tokenize quoted string', () => {
       const str = '[\'\\\'\', "\\""]';
       const tokens: Token[] = lex(str);
-      expectStringToken(tokens[1], 1, '\'');
-      expectStringToken(tokens[3], 7, '"');
+      expectStringToken(tokens[1], 1, 5, '\'');
+      expectStringToken(tokens[3], 7, 11, '"');
     });

     it('should tokenize escaped quoted string', () => {
@@ -163,86 +164,89 @@ function expectErrorToken(token: Token, index: any, message: string) {
     it('should tokenize relation', () => {
       const tokens: Token[] = lex('! == != < > <= >= === !==');
-      expectOperatorToken(tokens[0], 0, '!');
-      expectOperatorToken(tokens[1], 2, '==');
-      expectOperatorToken(tokens[2], 5, '!=');
-      expectOperatorToken(tokens[3], 8, '<');
-      expectOperatorToken(tokens[4], 10, '>');
-      expectOperatorToken(tokens[5], 12, '<=');
-      expectOperatorToken(tokens[6], 15, '>=');
-      expectOperatorToken(tokens[7], 18, '===');
-      expectOperatorToken(tokens[8], 22, '!==');
+      expectOperatorToken(tokens[0], 0, 1, '!');
+      expectOperatorToken(tokens[1], 2, 4, '==');
+      expectOperatorToken(tokens[2], 5, 7, '!=');
+      expectOperatorToken(tokens[3], 8, 9, '<');
+      expectOperatorToken(tokens[4], 10, 11, '>');
+      expectOperatorToken(tokens[5], 12, 14, '<=');
+      expectOperatorToken(tokens[6], 15, 17, '>=');
+      expectOperatorToken(tokens[7], 18, 21, '===');
+      expectOperatorToken(tokens[8], 22, 25, '!==');
     });

     it('should tokenize statements', () => {
       const tokens: Token[] = lex('a;b;');
-      expectIdentifierToken(tokens[0], 0, 'a');
-      expectCharacterToken(tokens[1], 1, ';');
-      expectIdentifierToken(tokens[2], 2, 'b');
-      expectCharacterToken(tokens[3], 3, ';');
+      expectIdentifierToken(tokens[0], 0, 1, 'a');
+      expectCharacterToken(tokens[1], 1, 2, ';');
+      expectIdentifierToken(tokens[2], 2, 3, 'b');
+      expectCharacterToken(tokens[3], 3, 4, ';');
     });

     it('should tokenize function invocation', () => {
       const tokens: Token[] = lex('a()');
-      expectIdentifierToken(tokens[0], 0, 'a');
-      expectCharacterToken(tokens[1], 1, '(');
-      expectCharacterToken(tokens[2], 2, ')');
+      expectIdentifierToken(tokens[0], 0, 1, 'a');
+      expectCharacterToken(tokens[1], 1, 2, '(');
+      expectCharacterToken(tokens[2], 2, 3, ')');
     });

     it('should tokenize simple method invocations', () => {
       const tokens: Token[] = lex('a.method()');
-      expectIdentifierToken(tokens[2], 2, 'method');
+      expectIdentifierToken(tokens[2], 2, 8, 'method');
     });

     it('should tokenize method invocation', () => {
       const tokens: Token[] = lex('a.b.c (d) - e.f()');
-      expectIdentifierToken(tokens[0], 0, 'a');
-      expectCharacterToken(tokens[1], 1, '.');
-      expectIdentifierToken(tokens[2], 2, 'b');
-      expectCharacterToken(tokens[3], 3, '.');
-      expectIdentifierToken(tokens[4], 4, 'c');
-      expectCharacterToken(tokens[5], 6, '(');
-      expectIdentifierToken(tokens[6], 7, 'd');
-      expectCharacterToken(tokens[7], 8, ')');
-      expectOperatorToken(tokens[8], 10, '-');
-      expectIdentifierToken(tokens[9], 12, 'e');
-      expectCharacterToken(tokens[10], 13, '.');
-      expectIdentifierToken(tokens[11], 14, 'f');
-      expectCharacterToken(tokens[12], 15, '(');
-      expectCharacterToken(tokens[13], 16, ')');
+      expectIdentifierToken(tokens[0], 0, 1, 'a');
+      expectCharacterToken(tokens[1], 1, 2, '.');
+      expectIdentifierToken(tokens[2], 2, 3, 'b');
+      expectCharacterToken(tokens[3], 3, 4, '.');
+      expectIdentifierToken(tokens[4], 4, 5, 'c');
+      expectCharacterToken(tokens[5], 6, 7, '(');
+      expectIdentifierToken(tokens[6], 7, 8, 'd');
+      expectCharacterToken(tokens[7], 8, 9, ')');
+      expectOperatorToken(tokens[8], 10, 11, '-');
+      expectIdentifierToken(tokens[9], 12, 13, 'e');
+      expectCharacterToken(tokens[10], 13, 14, '.');
+      expectIdentifierToken(tokens[11], 14, 15, 'f');
+      expectCharacterToken(tokens[12], 15, 16, '(');
+      expectCharacterToken(tokens[13], 16, 17, ')');
     });

-    it('should tokenize number', () => { expectNumberToken(lex('0.5')[0], 0, 0.5); });
+    it('should tokenize number', () => { expectNumberToken(lex('0.5')[0], 0, 3, 0.5); });

     it('should tokenize number with exponent', () => {
       let tokens: Token[] = lex('0.5E-10');
       expect(tokens.length).toEqual(1);
-      expectNumberToken(tokens[0], 0, 0.5E-10);
+      expectNumberToken(tokens[0], 0, 7, 0.5E-10);
       tokens = lex('0.5E+10');
-      expectNumberToken(tokens[0], 0, 0.5E+10);
+      expectNumberToken(tokens[0], 0, 7, 0.5E+10);
     });

     it('should return exception for invalid exponent', () => {
       expectErrorToken(
-          lex('0.5E-')[0], 4, 'Lexer Error: Invalid exponent at column 4 in expression [0.5E-]');
+          lex('0.5E-')[0], 4, 5,
+          'Lexer Error: Invalid exponent at column 4 in expression [0.5E-]');

       expectErrorToken(
-          lex('0.5E-A')[0], 4,
+          lex('0.5E-A')[0], 4, 5,
           'Lexer Error: Invalid exponent at column 4 in expression [0.5E-A]');
     });

     it('should tokenize number starting with a dot',
-       () => { expectNumberToken(lex('.5')[0], 0, 0.5); });
+       () => { expectNumberToken(lex('.5')[0], 0, 2, 0.5); });

     it('should throw error on invalid unicode', () => {
       expectErrorToken(
-          lex('\'\\u1\'\'bla\'')[0], 2,
+          lex('\'\\u1\'\'bla\'')[0], 2, 2,
           'Lexer Error: Invalid unicode escape [\\u1\'\'b] at column 2 in expression [\'\\u1\'\'bla\']');
     });

-    it('should tokenize hash as operator', () => { expectOperatorToken(lex('#')[0], 0, '#'); });
+    it('should tokenize hash as operator',
+       () => { expectOperatorToken(lex('#')[0], 0, 1, '#'); });

-    it('should tokenize ?. as operator', () => { expectOperatorToken(lex('?.')[0], 0, '?.'); });
+    it('should tokenize ?. as operator',
+       () => { expectOperatorToken(lex('?.')[0], 0, 2, '?.'); });
   });
 });
 }