feat(lexer): initial (wip) implementation.

This commit is contained in:
Chirayu Krishnappa 2014-09-26 13:52:12 -07:00
parent cff47d4f8e
commit c85ab3a5a4
7 changed files with 658 additions and 2 deletions

View File

@ -0,0 +1,460 @@
import {List, ListWrapper, SetWrapper} from "facade/collection";
import {FIELD, NumberWrapper, StringJoiner, StringWrapper} from "facade/lang";
// TODO(chirayu): Rewrite as consts when possible.
// Numeric tags stored in Token.type. The values are contiguous (1..6);
// Token.toString() relies on that ordering for its range check.
export var TOKEN_TYPE_CHARACTER = 1,
    TOKEN_TYPE_IDENTIFIER = 2,
    TOKEN_TYPE_KEYWORD = 3,
    TOKEN_TYPE_STRING = 4,
    TOKEN_TYPE_OPERATOR = 5,
    TOKEN_TYPE_NUMBER = 6;
/**
 * A single lexical token produced by the Scanner.
 *
 * Every token records the `index` at which it starts in the input, a `type`
 * tag (one of the TOKEN_TYPE_* values) and two payload slots: `_intValue`
 * (character code or numeric value) and `_strValue` (text of the token).
 */
export class Token {
  @FIELD('final index:int')
  @FIELD('final type:int')
  @FIELD('final _intValue:int')
  @FIELD('final _strValue:String')
  constructor(index:number/*int*/, type:number/*int*/, intValue:number/*int*/, strValue:string) {
    /**
     * NOTE: To ensure that this constructor creates the same hidden class each time, ensure that
     * all the fields are assigned to in the exact same order in each run of this constructor.
     */
    this.index = index;
    this.type = type;
    this._intValue = intValue;
    this._strValue = strValue;
  }

  /** True when this is a character token whose character code equals `code`. */
  isCharacter(code:number/*int*/):boolean {
    return (this.type == TOKEN_TYPE_CHARACTER && this._intValue == code);
  }

  /** True when this is a number token. */
  isNumber():boolean {
    return (this.type == TOKEN_TYPE_NUMBER);
  }

  /** True when this is a string-literal token. */
  isString():boolean {
    return (this.type == TOKEN_TYPE_STRING);
  }

  /** True when this is an operator token whose text equals `operater`. */
  isOperator(operater:string):boolean {
    return (this.type == TOKEN_TYPE_OPERATOR && this._strValue == operater);
  }

  /** True when this is an identifier token. */
  isIdentifier():boolean {
    return (this.type == TOKEN_TYPE_IDENTIFIER);
  }

  /** True when this is a keyword token (any keyword). */
  isKeyword():boolean {
    return (this.type == TOKEN_TYPE_KEYWORD);
  }

  /** True when this is the keyword `null`. */
  isKeywordNull():boolean {
    return (this.type == TOKEN_TYPE_KEYWORD && this._strValue == "null");
  }

  /** True when this is the keyword `undefined`. */
  isKeywordUndefined():boolean {
    return (this.type == TOKEN_TYPE_KEYWORD && this._strValue == "undefined");
  }

  /** True when this is the keyword `true`. */
  isKeywordTrue():boolean {
    return (this.type == TOKEN_TYPE_KEYWORD && this._strValue == "true");
  }

  /** True when this is the keyword `false`. */
  isKeywordFalse():boolean {
    return (this.type == TOKEN_TYPE_KEYWORD && this._strValue == "false");
  }

  /** Returns the numeric value of a number token, or -1 for any other token type. */
  toNumber():number/*int*/ {
    // -1 instead of NULL ok?
    return (this.type == TOKEN_TYPE_NUMBER) ? this._intValue : -1;
  }

  /**
   * Returns the token text for character, identifier, keyword and string
   * tokens, the decimal rendering of the value for number tokens, and null
   * otherwise.
   *
   * NOTE(review): TOKEN_TYPE_OPERATOR lies outside the CHARACTER..STRING range
   * checked here, so operator tokens yield null -- confirm that is intended.
   */
  toString():string {
    var type:number/*int*/ = this.type;
    if (type >= TOKEN_TYPE_CHARACTER && type <= TOKEN_TYPE_STRING) {
      return this._strValue;
    } else if (type == TOKEN_TYPE_NUMBER) {
      return this._intValue.toString();
    } else {
      return null;
    }
  }
}
// Creates a character token: the code goes in the int slot and its
// one-character string form in the string slot.
function newCharacterToken(index:number/*int*/, code:number/*int*/):Token {
  var text:string = StringWrapper.fromCharCode(code);
  return new Token(index, TOKEN_TYPE_CHARACTER, code, text);
}
// Creates an identifier token carrying `text`; the int slot is set to 0.
function newIdentifierToken(index:number/*int*/, text:string):Token {
  var token:Token = new Token(index, TOKEN_TYPE_IDENTIFIER, 0, text);
  return token;
}
// Creates a keyword token carrying `text`; the int slot is set to 0.
function newKeywordToken(index:number/*int*/, text:string):Token {
  var token:Token = new Token(index, TOKEN_TYPE_KEYWORD, 0, text);
  return token;
}
// Creates an operator token carrying `text`; the int slot is set to 0.
function newOperatorToken(index:number/*int*/, text:string):Token {
  var token:Token = new Token(index, TOKEN_TYPE_OPERATOR, 0, text);
  return token;
}
// Creates a string-literal token carrying `text`; the int slot is set to 0.
function newStringToken(index:number/*int*/, text:string):Token {
  var token:Token = new Token(index, TOKEN_TYPE_STRING, 0, text);
  return token;
}
// Creates a number token: `n` goes in the int slot, the string slot is empty.
function newNumberToken(index:number/*int*/, n:number/*int*/):Token {
  var token:Token = new Token(index, TOKEN_TYPE_NUMBER, n, "");
  return token;
}
// Sentinel token built with index -1 and type 0; type 0 matches none of the
// TOKEN_TYPE_* values (1..6), so every is*() predicate returns false for it.
var EOF:Token = new Token(-1, 0, 0, "");
// Character codes the scanner compares `peek` against, named after the
// character they stand for.
// TODO(chirayu): Rewrite as consts when possible.
// $EOF (0) is what Scanner.advance() stores in `peek` once the index has run
// past the end of the input.
var $EOF = 0;
// Whitespace / control characters.
var $TAB = 9;
var $LF = 10;
var $VTAB = 11;
var $FF = 12;
var $CR = 13;
var $SPACE = 32;
// Punctuation.
var $BANG = 33;
var $DQ = 34;
var $$ = 36;
var $PERCENT = 37;
var $AMPERSAND = 38;
var $SQ = 39;
var $LPAREN = 40;
var $RPAREN = 41;
var $STAR = 42;
var $PLUS = 43;
var $COMMA = 44;
var $MINUS = 45;
var $PERIOD = 46;
var $SLASH = 47;
var $COLON = 58;
var $SEMICOLON = 59;
var $LT = 60;
var $EQ = 61;
var $GT = 62;
var $QUESTION = 63;
// Bounds of the decimal digit range.
var $0 = 48;
var $9 = 57;
// Upper-case letters.
var $A = 65, $B = 66, $C = 67, $D = 68, $E = 69, $F = 70, $G = 71, $H = 72,
$I = 73, $J = 74, $K = 75, $L = 76, $M = 77, $N = 78, $O = 79, $P = 80,
$Q = 81, $R = 82, $S = 83, $T = 84, $U = 85, $V = 86, $W = 87, $X = 88,
$Y = 89, $Z = 90;
var $LBRACKET = 91;
var $BACKSLASH = 92;
var $RBRACKET = 93;
var $CARET = 94;
var $_ = 95;
// Lower-case letters.
var $a = 97, $b = 98, $c = 99, $d = 100, $e = 101, $f = 102, $g = 103,
$h = 104, $i = 105, $j = 106, $k = 107, $l = 108, $m = 109, $n = 110,
$o = 111, $p = 112, $q = 113, $r = 114, $s = 115, $t = 116, $u = 117,
$v = 118, $w = 119, $x = 120, $y = 121, $z = 122;
var $LBRACE = 123;
var $BAR = 124;
var $RBRACE = 125;
var $TILDE = 126;
// Non-breaking space; treated as whitespace by isWhitespace().
var $NBSP = 160;
/**
 * Converts an expression string into Tokens, one scanToken() call at a time.
 *
 * State: `input`/`length` are fixed at construction; `index` is the position
 * of the current character and `peek` is its character code ($EOF once the
 * index has run past the end of the input).
 */
export class Scanner {
  @FIELD('final input:String')
  @FIELD('final length:int')
  @FIELD('peek:int')
  @FIELD('index:int')
  constructor(input:string) {
    this.input = input;
    this.length = input.length;
    this.peek = 0;
    this.index = -1;
    // Prime `peek` with the first character (or $EOF for empty input).
    this.advance();
  }

  /** Moves to the next character, updating `index` and `peek`. */
  advance() {
    this.peek = ++this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.input, this.index);
  }

  /**
   * Scans and returns the next token, or null once only whitespace (or
   * nothing) remains. Throws via error() on a character that starts no token.
   */
  scanToken():Token {
    var input = this.input,
        length = this.length,
        peek = this.peek,
        index = this.index;
    // Skip whitespace (everything up to and including the space character),
    // working on locals and writing the state back once afterwards.
    while (peek <= $SPACE) {
      if (++index >= length) {
        peek = $EOF;
        break;
      } else {
        peek = StringWrapper.charCodeAt(input, index);
      }
    }
    this.peek = peek;
    this.index = index;
    if (index >= length) {
      return null;
    }
    // Handle identifiers and numbers.
    if (isIdentifierStart(peek)) return this.scanIdentifier();
    if (isDigit(peek)) return this.scanNumber(index);
    var start:number/*int*/ = index;
    switch (peek) {
      case $PERIOD:
        this.advance();
        // The local `peek` still holds '.', so the character after the period
        // must be read from `this.peek`: ".5" is a number, a lone "." is a
        // character token.
        return isDigit(this.peek) ? this.scanNumber(start) :
                                    newCharacterToken(start, $PERIOD);
      case $LPAREN: case $RPAREN:
      case $LBRACE: case $RBRACE:
      case $LBRACKET: case $RBRACKET:
      case $COMMA:
      case $COLON:
      case $SEMICOLON:
        return this.scanCharacter(start, peek);
      case $SQ:
      case $DQ:
        return this.scanString();
      case $PLUS:
      case $MINUS:
      case $STAR:
      case $SLASH:
      case $PERCENT:
      case $CARET:
      case $QUESTION:
        return this.scanOperator(start, StringWrapper.fromCharCode(peek));
      case $LT:
      case $GT:
      case $BANG:
      case $EQ:
        // One of < > ! =, optionally followed by '='.
        return this.scanComplexOperator(start, $EQ, StringWrapper.fromCharCode(peek), '=');
      case $AMPERSAND:
        return this.scanComplexOperator(start, $AMPERSAND, '&', '&');
      case $BAR:
        return this.scanComplexOperator(start, $BAR, '|', '|');
      case $TILDE:
        // NOTE(review): a '~' that is not followed by '/' produces "~", which
        // is not in OPERATORS, so the assert in scanComplexOperator fails.
        return this.scanComplexOperator(start, $SLASH, '~', '/');
      case $NBSP:
        while (isWhitespace(this.peek)) this.advance();
        return this.scanToken();
    }
    this.error('Unexpected character [' + StringWrapper.fromCharCode(peek) + ']');
    return null;
  }

  /** Consumes the current character (which must be `code`) as a character token. */
  scanCharacter(start:number/*int*/, code:number/*int*/):Token {
    assert(this.peek == code);
    this.advance();
    return newCharacterToken(start, code);
  }

  /** Consumes a single-character operator `str` and returns its token. */
  scanOperator(start:number/*int*/, str:string):Token {
    assert(this.peek == StringWrapper.charCodeAt(str, 0));
    assert(SetWrapper.has(OPERATORS, str));
    this.advance();
    return newOperatorToken(start, str);
  }

  /**
   * Consumes the operator `one`, extended by `two` when the following
   * character has code `code` (e.g. '<' followed by '=' becomes '<=').
   */
  scanComplexOperator(start:number/*int*/, code:number/*int*/, one:string, two:string):Token {
    assert(this.peek == StringWrapper.charCodeAt(one, 0));
    this.advance();
    var str:string = one;
    if (this.peek == code) {
      this.advance();
      str += two;
    }
    assert(SetWrapper.has(OPERATORS, str));
    return newOperatorToken(start, str);
  }

  /** Scans an identifier; returns a keyword token if the text is in KEYWORDS. */
  scanIdentifier():Token {
    assert(isIdentifierStart(this.peek));
    var start:number/*int*/ = this.index;
    this.advance();
    while (isIdentifierPart(this.peek)) this.advance();
    var str:string = this.input.substring(start, this.index);
    if (SetWrapper.has(KEYWORDS, str)) {
      return newKeywordToken(start, str);
    } else {
      return newIdentifierToken(start, str);
    }
  }

  /**
   * Scans a numeric literal that began at `start`. `start` is one character
   * before the current index when the literal opens with a period (".5").
   */
  scanNumber(start:number/*int*/):Token {
    assert(isDigit(this.peek));
    // "simple" means digits only so far, i.e. parseable as an integer.
    var simple:boolean = (this.index === start);
    this.advance();  // Skip initial digit.
    while (true) {
      if (isDigit(this.peek)) {
        // Do nothing.
      } else if (this.peek == $PERIOD) {
        simple = false;
      } else if (isExponentStart(this.peek)) {
        this.advance();
        if (isExponentSign(this.peek)) this.advance();
        if (!isDigit(this.peek)) this.error('Invalid exponent');
        simple = false;
      } else {
        break;
      }
      this.advance();
    }
    var str:string = this.input.substring(start, this.index);
    // TODO
    // Both facade NumberWrapper implementations expose parseFloat; there is
    // no parseDouble.
    var value:number = simple ? NumberWrapper.parseIntAutoRadix(str) : NumberWrapper.parseFloat(str);
    return newNumberToken(start, value);
  }

  /**
   * Scans a single- or double-quoted string literal and returns a string
   * token holding the unescaped contents. Throws via error() if the input
   * ends before the closing quote.
   */
  scanString():Token {
    assert(this.peek == $SQ || this.peek == $DQ);
    var start:number/*int*/ = this.index;
    var quote:number/*int*/ = this.peek;
    this.advance();  // Skip initial quote.
    // Allocated lazily: only strings that contain an escape need a joiner.
    var buffer:StringJoiner; //ckck
    // Start of the pending run of literal (unescaped) characters.
    var marker:number/*int*/ = this.index;
    var input:string = this.input;
    while (this.peek != quote) {
      if (this.peek == $BACKSLASH) {
        if (buffer == null) buffer = new StringJoiner();
        buffer.add(input.substring(marker, this.index));
        this.advance();
        var unescapedCode:number/*int*/;
        if (this.peek == $u) {
          // 4 character hex code for unicode character.
          var hex:string = input.substring(this.index + 1, this.index + 5);
          unescapedCode = NumberWrapper.parseInt(hex, 16);
          // Skip the 'u' and the four hex digits.
          for (var i:number/*int*/ = 0; i < 5; i++) {
            this.advance();
          }
        } else {
          unescapedCode = unescape(this.peek);
          this.advance();
        }
        buffer.add(StringWrapper.fromCharCode(unescapedCode));
        marker = this.index;
      } else if (this.peek == $EOF) {
        this.error('Unterminated quote');
      } else {
        this.advance();
      }
    }
    var last:string = input.substring(marker, this.index);
    this.advance();  // Skip terminating quote.
    // Compute the unescaped string value.
    var unescaped:string = last;
    if (buffer != null) {
      buffer.add(last);
      unescaped = buffer.toString();
    }
    return newStringToken(start, unescaped);
  }

  /**
   * Throws a lexer error message that reports `message`, the current index
   * and the full input expression.
   */
  error(message:string) {
    // The scanner has no offset field, so the column is simply the index.
    var position:number/*int*/ = this.index;
    throw `Lexer Error: ${message} at column ${position} in expression [${this.input}]`;
  }
}
// Whitespace is any code from $TAB through $SPACE inclusive, plus $NBSP.
function isWhitespace(code:number/*int*/):boolean {
  if (code == $NBSP) {
    return true;
  }
  return $TAB <= code && code <= $SPACE;
}
// An identifier may start with a letter (either case), '_' or '$'.
function isIdentifierStart(code:number/*int*/):boolean {
  var lower:boolean = code >= $a && code <= $z;
  var upper:boolean = code >= $A && code <= $Z;
  return lower || upper || code == $_ || code == $$;
}
// After the first character an identifier may also contain decimal digits.
function isIdentifierPart(code:number/*int*/):boolean {
  var lower:boolean = code >= $a && code <= $z;
  var upper:boolean = code >= $A && code <= $Z;
  var digit:boolean = code >= $0 && code <= $9;
  return lower || upper || digit || code == $_ || code == $$;
}
// True for the character codes of '0' through '9'.
function isDigit(code:number/*int*/):boolean {
  return code >= $0 && code <= $9;
}
// True for 'e' or 'E', the marker that introduces an exponent in a number.
function isExponentStart(code:number/*int*/):boolean {
  if (code == $e) {
    return true;
  }
  return code == $E;
}
// True for '-' or '+', the optional sign that may follow an exponent marker.
function isExponentSign(code:number/*int*/):boolean {
  if (code == $MINUS) {
    return true;
  }
  return code == $PLUS;
}
// Maps the character following a backslash to the character it denotes:
// n, f, r, t and v become the matching control characters; any other code is
// returned unchanged.
function unescape(code:number/*int*/):number/*int*/ {
  if (code == $n) return $LF;
  if (code == $f) return $FF;
  if (code == $r) return $CR;
  if (code == $t) return $TAB;
  if (code == $v) return $VTAB;
  return code;
}
// Every operator string the scanner may emit; scanOperator() and
// scanComplexOperator() assert membership in this set.
var OPERATORS = SetWrapper.createFromList([
  '+', '-', '*', '/', '~/', '%', '^',
  '=', '==', '!=',
  '<', '>', '<=', '>=',
  '&&', '||', '&', '|',
  '!', '?'
]);
// Identifier spellings that scanIdentifier() reports as keyword tokens.
var KEYWORDS = SetWrapper.createFromList(['null', 'undefined', 'true', 'false']);
/**
 * Tokenizes `text` and returns the tokens in input order. Scanning stops at
 * the first null returned by Scanner.scanToken().
 */
export function Lexer(text:string):List {
  var scanner:Scanner = new Scanner(text);
  var tokens:List<Token> = [];
  for (var token:Token = scanner.scanToken(); token != null; token = scanner.scanToken()) {
    ListWrapper.push(tokens, token);
  }
  return tokens;
}

View File

@ -0,0 +1,2 @@
/** Placeholder for the expression parser; it has no members yet. */
export class Parser {}

View File

@ -0,0 +1,98 @@
import {describe, it, expect} from 'test_lib/test_lib';
import {Lexer, Scanner, Token} from 'change_detection/parser/lexer';
import {DOM} from 'facade/dom';
import {List, ListWrapper} from "facade/collection";
import {StringWrapper} from "facade/lang";
// Checks that `token` is a Token instance starting at `index`.
function expectToken(token, index) {
  var isToken = token instanceof Token;
  expect(isToken).toBe(true);
  var actualIndex = token.index;
  expect(actualIndex).toEqual(index);
}
// Checks that `token` is the character token for the one-character string
// `character`, starting at `index`.
function expectCharacterToken(token, index, character) {
  expect(character.length).toBe(1);
  expectToken(token, index);
  var code = StringWrapper.charCodeAt(character, 0);
  var matches = token.isCharacter(code);
  expect(matches).toBe(true);
}
// Checks that `token` is the operator token for `operator` at `index`.
function expectOperatorToken(token, index, operator) {
  expectToken(token, index);
  var matches = token.isOperator(operator);
  expect(matches).toBe(true);
}
// Checks that `token` is a number token at `index` whose value is `n`.
function expectNumberToken(token, index, n) {
  expectToken(token, index);
  var numeric = token.isNumber();
  expect(numeric).toBe(true);
  var value = token.toNumber();
  expect(value).toBe(n);
}
// Checks that `token` is a string token at `index` whose text is `str`.
function expectStringToken(token, index, str) {
  expectToken(token, index);
  var stringy = token.isString();
  expect(stringy).toBe(true);
  var text = token.toString();
  expect(text).toBe(str);
}
// Checks that `token` is an identifier token at `index` spelled `identifier`.
function expectIdentifierToken(token, index, identifier) {
  expectToken(token, index);
  var isIdent = token.isIdentifier();
  expect(isIdent).toBe(true);
  var text = token.toString();
  expect(text).toEqual(identifier);
}
// Checks that `token` is a keyword token at `index` spelled `keyword`.
function expectKeywordToken(token, index, keyword) {
  expectToken(token, index);
  var isKw = token.isKeyword();
  expect(isKw).toBe(true);
  var text = token.toString();
  expect(text).toEqual(keyword);
}
/**
 * Registers the lexer specs. Each spec runs Lexer() on a short expression and
 * checks the kind, start index and value of the resulting tokens.
 */
export function main() {
  describe('lexer', function() {
    describe('token', function() {
      it('should tokenize a simple identifier', function() {
        // Lexer() yields Token objects, so the lists are typed List<Token>.
        var tokens:List<Token> = Lexer("j");
        expect(tokens.length).toEqual(1);
        expectIdentifierToken(tokens[0], 0, 'j');
      });
      it('should tokenize a dotted identifier', function() {
        var tokens:List<Token> = Lexer("j.k");
        expect(tokens.length).toEqual(3);
        expectIdentifierToken(tokens[0], 0, 'j');
        expectCharacterToken (tokens[1], 1, '.');
        expectIdentifierToken(tokens[2], 2, 'k');
      });
      it('should tokenize an operator', function() {
        var tokens:List<Token> = Lexer("j-k");
        expect(tokens.length).toEqual(3);
        expectOperatorToken(tokens[1], 1, '-');
      });
      it('should tokenize an indexed operator', function() {
        var tokens:List<Token> = Lexer("j[k]");
        expect(tokens.length).toEqual(4);
        expectCharacterToken(tokens[1], 1, "[");
        expectCharacterToken(tokens[3], 3, "]");
      });
      it('should tokenize numbers', function() {
        var tokens:List<Token> = Lexer("88");
        expect(tokens.length).toEqual(1);
        expectNumberToken(tokens[0], 0, 88);
      });
      it('should tokenize numbers within index ops', function() {
        expectNumberToken(Lexer("a[22]")[2], 2, 22);
      });
      it('should tokenize simple quoted strings', function() {
        expectStringToken(Lexer('"a"')[0], 0, "a");
      });
      it('should tokenize quoted strings with escaped quotes', function() {
        expectStringToken(Lexer('"a\\""')[0], 0, 'a"');
      });
    });
  });
}

View File

@ -17,4 +17,10 @@ class ListWrapper {
// Returns the element of `m` stored under `k` (via the index operator).
static get(m, k) {
  return m[k];
}
// Stores `v` in `m` under `k` (via the index-assignment operator).
static void set(m, k, v) {
  m[k] = v;
}
// Returns whether list `m` contains the element `k`. Dart lists expose
// `contains`; `containsKey` exists only on Map, so calling it on a List
// throws NoSuchMethodError.
static contains(m, k) => m.contains(k);
// Appends `e` to the end of `l`.
static void push(List l, e) {
  l.add(e);
}
}
/// Static helpers over Dart's [Set], mirroring the ES6 facade of the same name.
class SetWrapper {
  /// Builds a set holding the elements of [l].
  static Set createFromList(List l) => new Set.from(l);

  /// Returns whether [s] contains [key].
  static bool has(Set s, key) => s.contains(key);
}

View File

@ -15,4 +15,10 @@ export class ListWrapper {
// Returns a shallow copy of `array` as a new Array.
static clone(array) {
  var copy = Array.prototype.slice.apply(array, [0]);
  return copy;
}
// Appends `e` to the end of `l`.
static push(l, e) {
  l.push(e);
}
}
/** Static helpers over the native Set. */
export class SetWrapper {
  /** Builds a Set holding the elements of `lst`. */
  static createFromList(lst:List) {
    var result = new Set(lst);
    return result;
  }

  /** Returns whether `s` contains `key`. */
  static has(s:Set, key):boolean {
    var found:boolean = s.has(key);
    return found;
  }
}

View File

@ -10,3 +10,39 @@ class FIELD {
// Empty marker classes with no members.
// NOTE(review): presumably counterparts of the annotations used in the ES6
// sources (like FIELD) -- confirm against the transpiler.
class CONST {}
class ABSTRACT {}
class IMPLEMENTS {}
/// Static string helpers that match the signatures of the ES6 facade.
class StringWrapper {
  /// Returns the one-character string for the character code [code].
  static String fromCharCode(int code) => new String.fromCharCode(code);

  /// Returns the UTF-16 code unit of [s] at [index].
  static charCodeAt(String s, int index) => s.codeUnitAt(index);
}
/// Collects string fragments and concatenates them on demand.
class StringJoiner {
  // Fragments in the order they were added.
  List<String> _parts = <String>[];

  /// Appends [part] to the pending fragments.
  void add(String part) => _parts.add(part);

  /// Returns all fragments joined without a separator.
  String toString() {
    return _parts.join("");
  }
}
/// Static number-parsing helpers that match the signatures of the ES6 facade.
class NumberWrapper {
  /// Parses [text] with `int.parse`, passing no explicit radix.
  static int parseIntAutoRadix(String text) => int.parse(text);

  /// Parses [text] as an integer in base [radix].
  static int parseInt(String text, int radix) => int.parse(text, radix: radix);

  /// Parses [text] as a double.
  static double parseFloat(String text) => double.parse(text);
}

View File

@ -10,3 +10,51 @@ export class FIELD {
// Empty marker classes with no members.
// NOTE(review): presumably used as annotations alongside FIELD -- confirm
// against the transpiler.
export class CONST {}
export class ABSTRACT {}
export class IMPLEMENTS {}
/** Static string helpers that wrap the native String API. */
export class StringWrapper {
  /** Returns the one-character string for the UTF-16 code unit `code`. */
  static fromCharCode(code:number/*int*/) {
    var text = String.fromCharCode(code);
    return text;
  }

  /** Returns the UTF-16 code unit of `s` at `index` (NaN when out of range). */
  static charCodeAt(s:string, index:number/*int*/) {
    var unit = s.charCodeAt(index);
    return unit;
  }
}
/** Collects string fragments and concatenates them on demand. */
export class StringJoiner {
  constructor() {
    // Fragments in the order they were added.
    this.parts = [];
  }

  /** Appends `part` to the pending fragments. */
  add(part:string) {
    var fragments = this.parts;
    fragments.push(part);
  }

  /** Returns all fragments joined without a separator. */
  toString():string {
    var joined:string = this.parts.join("");
    return joined;
  }
}
/** Static number-parsing helpers that wrap the global parseInt/parseFloat. */
export class NumberWrapper {
  /**
   * Parses `text` with the global parseInt and no explicit radix.
   * Throws an Error when the result is NaN.
   */
  static parseIntAutoRadix(text:string):number/*int*/ {
    var parsed:number/*int*/ = parseInt(text);
    if (!isNaN(parsed)) {
      return parsed;
    }
    throw new Error("Invalid integer literal when parsing " + text);
  }

  /**
   * Parses `text` as an integer in base `radix`.
   * Throws an Error when the result is NaN.
   */
  static parseInt(text:string, radix:number/*int*/):number/*int*/ {
    var parsed:number/*int*/ = parseInt(text, radix);
    if (!isNaN(parsed)) {
      return parsed;
    }
    throw new Error("Invalid integer literal when parsing " + text + " in base " + radix);
  }

  // TODO: NaN is a valid literal but is returned by parseFloat to indicate an error.
  // NOTE(review): the int hint in the return annotation looks wrong for a
  // float parser -- confirm before relying on it.
  /** Parses `text` with the global parseFloat; unparseable input yields NaN. */
  static parseFloat(text:string):number/*int*/ {
    var parsed = parseFloat(text);
    return parsed;
  }
}