diff options
Diffstat (limited to 'src/llex.js')
-rw-r--r-- | src/llex.js | 596 |
1 files changed, 596 insertions, 0 deletions
/* Source file: src/llex.js (added as a new file by this diff, index 0000000..4c85f65) */
/* jshint esversion: 6 */
"use strict";

const assert  = require('assert');

const lapi    = require('./lapi.js');
const ldebug  = require('./ldebug.js');
const ldo     = require('./ldo.js');
const lua     = require('./lua.js');
const lobject = require('./lobject');
const ljstype = require('./ljstype');
const TValue  = lobject.TValue;
const CT      = lua.constant_types;
const TS      = lua.thread_status;

/* value stored in 'ls.current' once the input stream is exhausted */
const EOZ = -1;

/* first token code that does not collide with a single-byte character */
const FIRST_RESERVED = 257;

/*
** Token codes. The offsets MUST be contiguous and in the same order as
** 'luaX_tokens', because 'luaX_token2str' indexes that table with
** (token - FIRST_RESERVED).
*/
const RESERVED = {
    /* terminal symbols denoted by reserved words */
    TK_AND:      FIRST_RESERVED,
    TK_BREAK:    FIRST_RESERVED + 1,
    TK_DO:       FIRST_RESERVED + 2,
    TK_ELSE:     FIRST_RESERVED + 3,
    TK_ELSEIF:   FIRST_RESERVED + 4,
    TK_END:      FIRST_RESERVED + 5,
    TK_FALSE:    FIRST_RESERVED + 6,
    TK_FOR:      FIRST_RESERVED + 7,
    TK_FUNCTION: FIRST_RESERVED + 8,
    TK_GOTO:     FIRST_RESERVED + 9,
    TK_IF:       FIRST_RESERVED + 10,
    TK_IN:       FIRST_RESERVED + 11,
    TK_LOCAL:    FIRST_RESERVED + 12,
    TK_NIL:      FIRST_RESERVED + 13,
    TK_NOT:      FIRST_RESERVED + 14,
    TK_OR:       FIRST_RESERVED + 15,
    TK_REPEAT:   FIRST_RESERVED + 16,
    TK_RETURN:   FIRST_RESERVED + 17,
    TK_THEN:     FIRST_RESERVED + 18,
    TK_TRUE:     FIRST_RESERVED + 19,
    TK_UNTIL:    FIRST_RESERVED + 20,
    TK_WHILE:    FIRST_RESERVED + 21,
    /* other terminal symbols */
    TK_IDIV:     FIRST_RESERVED + 22,
    TK_CONCAT:   FIRST_RESERVED + 23,
    TK_DOTS:     FIRST_RESERVED + 24,
    TK_EQ:       FIRST_RESERVED + 25,
    TK_GE:       FIRST_RESERVED + 26,
    TK_LE:       FIRST_RESERVED + 27,
    TK_NE:       FIRST_RESERVED + 28,
    TK_SHL:      FIRST_RESERVED + 29,
    TK_SHR:      FIRST_RESERVED + 30,
    TK_DBCOLON:  FIRST_RESERVED + 31,
    TK_EOS:      FIRST_RESERVED + 32,
    TK_FLT:      FIRST_RESERVED + 33,
    TK_INT:      FIRST_RESERVED + 34,
    TK_NAME:     FIRST_RESERVED + 35,
    TK_STRING:   FIRST_RESERVED + 36
};

const R = RESERVED;

/* printable form of every token code, indexed by (token - FIRST_RESERVED) */
const luaX_tokens = [
    "and", "break", "do", "else", "elseif",
    "end", "false", "for", "function", "goto", "if",
    "in", "local", "nil", "not", "or", "repeat",
    "return", "then", "true", "until", "while",
    "//", "..", "...", "==", ">=", "<=", "~=",
    "<<", ">>", "::", "<eof>",
    "<number>", "<integer>", "<name>", "<string>"
];

/* number of reserved words (TK_AND .. TK_WHILE), not of all tokens */
const NUM_RESERVED = R.TK_WHILE - FIRST_RESERVED + 1;

/* reserved words and their token codes, kept as two parallel arrays */
const reserved_keywords = luaX_tokens.slice(0, NUM_RESERVED);
const reserved_keywords_tokens = reserved_keywords.map((_, i) => FIRST_RESERVED + i);

/*
** Character buffer. 'buffer' holds one character per slot, 'n' is a
** counter and 'off' a read offset. The lexer uses it in two ways:
** as input stream ('ls.z': 'n' = characters left, 'off' = next index)
** and as token buffer ('ls.buff': 'n' = number of live characters;
** slots at index >= n may contain leftovers from earlier tokens).
*/
class Buffer {
    constructor(string) {
        this.buffer = string ? string.split('') : [];
        this.n = this.buffer.length;
        this.off = 0;
    }
}

/* semantic information of a token: float, integer or string value */
class SemInfo {
    constructor() {
        this.r = NaN;
        this.i = NaN;
        this.ts = null;
    }
}

/* a token code plus its semantic information */
class Token {
    constructor() {
        this.token = NaN;
        /* 'llex' writes into this object, so it must exist up front */
        this.seminfo = new SemInfo();
    }
}

/* state of the lexer plus state of the parser when shared by all
   functions */
class LexState {
    constructor() {
        this.current = NaN;           /* current character (charint) */
        this.linenumber = NaN;        /* input line counter */
        this.lastline = NaN;          /* line of last token 'consumed' */
        this.t = new Token();         /* current token */
        this.lookahead = new Token(); /* look ahead token */
        this.fs = null;               /* current function (parser) */
        this.L = null;
        this.z = new Buffer();
        this.buff = new Buffer();     /* buffer for tokens */
        this.h = null;                /* to avoid collection/reuse strings */
        this.dyd = null;              /* dynamic structures used by the parser */
        this.source = null;           /* current source name */
        this.envn = null;             /* environment variable name */
    }
}

/*
** Text held in the token buffer between indices 'from' (inclusive) and
** 'to' (exclusive). Defaults to the whole live content; never reads
** past 'ls.buff.n', so leftovers of previous tokens are ignored.
*/
const buff_text = function(ls, from = 0, to = ls.buff.n) {
    return ls.buff.buffer.slice(from, to).join('');
};

/*
** The token buffer stores one-character strings; turn a numeric
** character code into such a string and pass strings through.
*/
const to_char = function(c) {
    return typeof c === 'number' ? String.fromCharCode(c) : c;
};

/* append character 'c' to the token buffer */
const save = function(ls, c) {
    const b = ls.buff;
    if (b.n + 1 > b.buffer.length && b.buffer.length >= Number.MAX_SAFE_INTEGER / 2)
        lexerror(ls, "lexical element too long", 0);
    b.buffer[b.n++] = c;
};

/*
** Printable form of 'token'. Single-character tokens may arrive either
** as a character (that is how 'llex' returns them) or as a character
** code below FIRST_RESERVED.
** NOTE(review): relies on lapi.lua_pushstring returning the pushed
** string - confirm against lapi.js.
*/
const luaX_token2str = function(ls, token) {
    if (typeof token === 'string')   /* single-char token held as a character */
        return lapi.lua_pushstring(ls.L, `'${token}'`);
    if (token < FIRST_RESERVED)      /* single-byte symbols? */
        return lapi.lua_pushstring(ls.L, `'${String.fromCharCode(token)}'`);

    const s = luaX_tokens[token - FIRST_RESERVED];
    if (token < R.TK_EOS)            /* fixed format (symbols and reserved words)? */
        return lapi.lua_pushstring(ls.L, `'${s}'`);
    return s;                        /* names, strings, and numerals */
};

const currIsNewline = function(ls) {
    return ls.current === '\n' || ls.current === '\r';
};

/* advance to the next input character (EOZ once the input is exhausted) */
const next = function(ls) {
    ls.current = ls.z.n-- > 0 ? ls.z.buffer[ls.z.off++] : EOZ;
};

const save_and_next = function(ls) {
    save(ls, ls.current);
    next(ls);
};

/*
** increment line number and skips newline sequence (any of
** \n, \r, \n\r, or \r\n)
*/
const inclinenumber = function(ls) {
    const old = ls.current;
    assert(currIsNewline(ls));
    next(ls);  /* skip '\n' or '\r' */
    if (currIsNewline(ls) && ls.current !== old)
        next(ls);  /* skip '\n\r' or '\r\n' */
    if (++ls.linenumber >= Number.MAX_SAFE_INTEGER)
        lexerror(ls, "chunk has too many lines", 0);
};

/*
** Prepare 'ls' to read from stream 'z'; 'firstchar' becomes the current
** character. 'ls.t' and 'ls.lookahead' must already be Token objects
** (a fresh LexState provides them).
*/
const luaX_setinput = function(L, ls, z, source, firstchar) {
    ls.t.token = 0;
    ls.L = L;
    ls.current = firstchar;
    ls.lookahead.token = R.TK_EOS;  /* no look-ahead token */
    ls.z = z;
    ls.fs = null;
    ls.linenumber = 1;
    ls.lastline = 1;
    ls.source = source;
    ls.envn = new TValue(CT.LUA_TLNGSTR, "_ENV");
};

/* if the current char is 'c', skip it (without saving) and return true */
const check_next1 = function(ls, c) {
    if (ls.current === c) {
        next(ls);
        return true;
    }

    return false;
};

/*
** Check whether current char is in set 'set' (with two chars) and
** saves it
*/
const check_next2 = function(ls, set) {
    if (ls.current === set.charAt(0) || ls.current === set.charAt(1)) {
        save_and_next(ls);
        return true;
    }

    return false;
};

/*
** Read a numeral into the token buffer and convert it; stores the value
** in 'seminfo' and returns R.TK_INT or R.TK_FLT.
** NOTE(review): luaO_str2num is handed an array of characters ending
** with '\0', exactly the shape the original code passed - confirm that
** this is what lobject expects.
*/
const read_numeral = function(ls, seminfo) {
    let expo = "Ee";
    const first = ls.current;
    assert(ljstype.lisdigit(ls.current));
    save_and_next(ls);
    if (first === '0' && check_next2(ls, "xX"))  /* hexadecimal? */
        expo = "Pp";

    for (;;) {
        if (check_next2(ls, expo))  /* exponent part? */
            check_next2(ls, "-+");  /* optional exponent sign */
        if (ljstype.lisxdigit(ls.current))
            save_and_next(ls);
        else if (ls.current === '.')
            save_and_next(ls);
        else break;
    }

    save(ls, '\0');

    /* only the live part of the buffer: older tokens may have left junk behind */
    const obj = lobject.luaO_str2num(ls.buff.buffer.slice(0, ls.buff.n));
    if (obj === false)  /* format error? */
        lexerror(ls, "malformed number", R.TK_FLT);
    if (obj.ttisinteger()) {
        seminfo.i = obj.value;
        return R.TK_INT;
    } else {
        assert(obj.ttisfloat());
        seminfo.r = obj.value;
        return R.TK_FLT;
    }
};

/* text to show for 'token' in an error message */
const txtToken = function(ls, token) {
    switch (token) {
        case R.TK_NAME: case R.TK_STRING:
        case R.TK_FLT: case R.TK_INT:
            /* quote what has been read so far */
            return lapi.lua_pushstring(ls.L, `'${buff_text(ls)}'`);
        default:
            return luaX_token2str(ls, token);
    }
};

/*
** Report a lexical error. When 'token' is non-zero the offending text
** is appended to the message.
** NOTE(review): callers assume ldo.luaD_throw does not return - confirm.
*/
const lexerror = function(ls, msg, token) {
    msg = ldebug.luaG_addinfo(ls.L, msg, ls.source, ls.linenumber);
    if (token)
        lapi.lua_pushstring(ls.L, `${msg} near ${txtToken(ls, token)}`);
    ldo.luaD_throw(ls.L, TS.LUA_ERRSYNTAX);
};

/*
** skip a sequence '[=*[' or ']=*]'; if sequence is well formed, return
** its number of '='s; otherwise, return a negative number (-1 iff there
** are no '='s after initial bracket)
*/
const skip_sep = function(ls) {
    let count = 0;
    const s = ls.current;
    assert(s === '[' || s === ']');
    save_and_next(ls);
    while (ls.current === '=') {
        save_and_next(ls);
        count++;
    }
    return ls.current === s ? count : (-count) - 1;
};

/*
** Read a long string or long comment whose opening bracket has 'sep'
** '=' signs. With a null 'seminfo' the text is a comment and is
** discarded; otherwise the contents (without delimiters) are stored in
** 'seminfo.ts'.
*/
const read_long_string = function(ls, seminfo, sep) {
    const line = ls.linenumber;  /* initial line (for error message) */
    save_and_next(ls);  /* skip 2nd '[' */

    if (currIsNewline(ls))  /* string starts with a newline? */
        inclinenumber(ls);  /* skip it */

    let done = false;
    while (!done) {
        switch (ls.current) {
            case EOZ: {  /* error */
                const what = seminfo ? "string" : "comment";
                const msg = lapi.lua_pushstring(ls.L, `unfinished long ${what} (starting at line ${line})`);
                lexerror(ls, msg, R.TK_EOS);
                break;
            }
            case ']': {
                if (skip_sep(ls) === sep) {
                    save_and_next(ls);  /* skip 2nd ']' */
                    done = true;
                }
                break;
            }
            case '\n': case '\r': {
                save(ls, '\n');
                inclinenumber(ls);
                if (!seminfo) ls.buff.n = 0;  /* avoid wasting space */
                break;
            }
            default: {
                if (seminfo) save_and_next(ls);
                else next(ls);
            }
        }
    }

    if (seminfo) {
        /* both delimiters are (2 + sep) characters long: strip them */
        const delim = 2 + sep;
        seminfo.ts = new TValue(CT.LUA_TLNGSTR, buff_text(ls, delim, ls.buff.n - delim));
    }
};

/* raise 'msg' unless condition 'c' holds */
const esccheck = function(ls, c, msg) {
    if (!c) {
        if (ls.current !== EOZ)
            save_and_next(ls);  /* add current to buffer for error message */
        lexerror(ls, msg, R.TK_STRING);
    }
};

/* save the current char, then return the value of the hex digit after it */
const gethexa = function(ls) {
    save_and_next(ls);
    esccheck(ls, ljstype.lisxdigit(ls.current), "hexadecimal digit expected");
    return lobject.luaO_hexavalue(ls.current);
};

/* read '\xXX'; leaves the last digit as current char and returns the code */
const readhexaesc = function(ls) {
    let r = gethexa(ls);
    r = (r << 4) + gethexa(ls);
    ls.buff.n -= 2;  /* remove saved chars from buffer */
    return r;
};

/* read '\u{XXX}' completely and return the code point */
const readutf8desc = function(ls) {
    let i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
    save_and_next(ls);  /* skip 'u' */
    esccheck(ls, ls.current === '{', "missing '{'");
    let r = gethexa(ls);  /* must have at least one digit */

    save_and_next(ls);
    while (ljstype.lisxdigit(ls.current)) {
        i++;
        r = (r << 4) + lobject.luaO_hexavalue(ls.current);
        esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
        save_and_next(ls);
    }
    esccheck(ls, ls.current === '}', "missing '}'");
    next(ls);  /* skip '}' */
    ls.buff.n -= i;  /* remove saved chars from buffer */
    return r;
};

/*
** Read a '\u{XXX}' escape and append its UTF-8 encoding to the buffer.
** NOTE(review): assumes luaO_utf8esc fills the END of 'buff' and
** returns the number of slots used; entries may be chars or byte
** values, 'to_char' accepts both - confirm against lobject.
*/
const utf8esc = function(ls) {
    const buff = new Array(lobject.UTF8BUFFSZ);
    let n = lobject.luaO_utf8esc(buff, readutf8desc(ls));
    for (; n > 0; n--)  /* add 'buff' to string */
        save(ls, to_char(buff[lobject.UTF8BUFFSZ - n]));
};

/* read a decimal escape '\ddd' (up to 3 digits) and return its value */
const readdecesc = function(ls) {
    let r = 0;  /* result accumulator */
    let i;
    for (i = 0; i < 3 && ljstype.lisdigit(ls.current); i++) {  /* read up to 3 digits */
        r = 10 * r + parseInt(ls.current, 10);
        save_and_next(ls);
    }
    esccheck(ls, r <= 255, "decimal escape too large");
    ls.buff.n -= i;  /* remove read digits from buffer */
    return r;
};

/*
** Handle one escape sequence inside a short string; on entry the
** current char is the backslash. The backslash is first saved (error
** messages quote it) and afterwards replaced by the decoded character.
*/
const read_escape = function(ls) {
    let c;        /* decoded character */
    let consume;  /* does the current char still belong to the escape? */

    save_and_next(ls);  /* keep '\\' for error messages */
    switch (ls.current) {
        case 'a': c = '\u0007'; consume = true; break;  /* JS has no '\a' escape */
        case 'b': c = '\b'; consume = true; break;
        case 'f': c = '\f'; consume = true; break;
        case 'n': c = '\n'; consume = true; break;
        case 'r': c = '\r'; consume = true; break;
        case 't': c = '\t'; consume = true; break;
        case 'v': c = '\v'; consume = true; break;
        case 'x': c = to_char(readhexaesc(ls)); consume = true; break;
        case 'u': utf8esc(ls); return;  /* saves its own bytes, removes '\\' */
        case '\n': case '\r':
            inclinenumber(ls); c = '\n'; consume = false; break;
        case '\\': case '"': case '\'':
            c = ls.current; consume = true; break;
        case EOZ: return;  /* will raise an error next loop */
        case 'z': {  /* zap following span of spaces */
            ls.buff.n -= 1;  /* remove '\\' */
            next(ls);  /* skip the 'z' */
            while (ljstype.lisspace(ls.current)) {
                if (currIsNewline(ls)) inclinenumber(ls);
                else next(ls);
            }
            return;
        }
        default: {
            esccheck(ls, ljstype.lisdigit(ls.current), "invalid escape sequence");
            c = to_char(readdecesc(ls));  /* digital escape '\ddd' */
            consume = false;
        }
    }

    if (consume)
        next(ls);
    ls.buff.n -= 1;  /* remove '\\' */
    save(ls, c);
};

/*
** Read a short string delimited by 'del' and store its contents
** (without the delimiters) in 'seminfo.ts'.
*/
const read_string = function(ls, del, seminfo) {
    save_and_next(ls);  /* keep delimiter (for error messages) */

    while (ls.current !== del) {
        switch (ls.current) {
            case EOZ:
                lexerror(ls, "unfinished string", R.TK_EOS);
                break;
            case '\n':
            case '\r':
                lexerror(ls, "unfinished string", R.TK_STRING);
                break;
            case '\\':  /* escape sequences */
                read_escape(ls);
                break;
            default:
                save_and_next(ls);
        }
    }
    save_and_next(ls);  /* skip delimiter */
    /* drop the opening and the closing delimiter */
    seminfo.ts = new TValue(CT.LUA_TLNGSTR, buff_text(ls, 1, ls.buff.n - 1));
};

/*
** Read the next token. Returns an R.TK_* code, or the character itself
** for single-character tokens; semantic values go into 'seminfo'.
*/
const llex = function(ls, seminfo) {
    ls.buff.n = 0;

    for (;;) {
        switch (ls.current) {
            case '\n': case '\r': {  /* line breaks */
                inclinenumber(ls);
                break;
            }
            case ' ': case '\f': case '\t': case '\v': {  /* spaces */
                next(ls);
                break;
            }
            case '-': {  /* '-' or '--' (comment) */
                next(ls);
                if (ls.current !== '-') return '-';
                /* else is a comment */
                next(ls);
                if (ls.current === '[') {  /* long comment? */
                    const sep = skip_sep(ls);
                    ls.buff.n = 0;  /* 'skip_sep' may dirty the buffer */
                    if (sep >= 0) {
                        read_long_string(ls, null, sep);  /* skip long comment */
                        ls.buff.n = 0;  /* previous call may dirty the buff. */
                        break;
                    }
                }

                /* else short comment */
                while (!currIsNewline(ls) && ls.current !== EOZ)
                    next(ls);  /* skip until end of line (or end of file) */
                break;
            }
            case '[': {  /* long string or simply '[' */
                const sep = skip_sep(ls);
                if (sep >= 0) {
                    read_long_string(ls, seminfo, sep);
                    return R.TK_STRING;
                } else if (sep !== -1)  /* '[=...' missing second bracket */
                    lexerror(ls, "invalid long string delimiter", R.TK_STRING);
                return '[';
            }
            case '=': {
                next(ls);
                if (check_next1(ls, '=')) return R.TK_EQ;
                else return '=';
            }
            case '<': {
                next(ls);
                if (check_next1(ls, '=')) return R.TK_LE;
                else if (check_next1(ls, '<')) return R.TK_SHL;
                else return '<';
            }
            case '>': {
                next(ls);
                if (check_next1(ls, '=')) return R.TK_GE;
                else if (check_next1(ls, '>')) return R.TK_SHR;
                else return '>';
            }
            case '/': {
                next(ls);
                if (check_next1(ls, '/')) return R.TK_IDIV;
                else return '/';
            }
            case '~': {
                next(ls);
                if (check_next1(ls, '=')) return R.TK_NE;
                else return '~';
            }
            case ':': {
                next(ls);
                if (check_next1(ls, ':')) return R.TK_DBCOLON;
                else return ':';
            }
            case '"': case '\'': {  /* short literal strings */
                read_string(ls, ls.current, seminfo);
                return R.TK_STRING;
            }
            case '.': {  /* '.', '..', '...', or number */
                save_and_next(ls);
                if (check_next1(ls, '.')) {
                    if (check_next1(ls, '.'))
                        return R.TK_DOTS;   /* '...' */
                    else return R.TK_CONCAT;   /* '..' */
                }
                else if (!ljstype.lisdigit(ls.current)) return '.';
                else return read_numeral(ls, seminfo);
            }
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9': {
                return read_numeral(ls, seminfo);
            }
            case EOZ: {
                return R.TK_EOS;
            }
            default: {
                if (ljstype.lislalpha(ls.current)) {  /* identifier or reserved word? */
                    do {
                        save_and_next(ls);
                    } while (ljstype.lislalnum(ls.current));

                    const ts = new TValue(CT.LUA_TLNGSTR, buff_text(ls));
                    seminfo.ts = ts;
                    const kw = reserved_keywords.indexOf(ts.value);
                    if (kw >= 0)  /* reserved word? */
                        return reserved_keywords_tokens[kw];
                    else
                        return R.TK_NAME;
                } else {  /* single-char tokens (+ - / ...) */
                    const c = ls.current;
                    next(ls);
                    return c;
                }
            }
        }
    }
};

/* advance to the next token, consuming a pending look-ahead token first */
const luaX_next = function(ls) {
    ls.lastline = ls.linenumber;
    if (ls.lookahead.token !== R.TK_EOS) {  /* is there a look-ahead token? */
        /* copy field by field: 'ls.t' and 'ls.lookahead' must stay distinct
           objects, otherwise discharging the look-ahead would erase 'ls.t' */
        ls.t.token = ls.lookahead.token;  /* use this one */
        ls.t.seminfo.i = ls.lookahead.seminfo.i;
        ls.t.seminfo.r = ls.lookahead.seminfo.r;
        ls.t.seminfo.ts = ls.lookahead.seminfo.ts;
        ls.lookahead.token = R.TK_EOS;  /* and discharge it */
    } else
        ls.t.token = llex(ls, ls.t.seminfo);  /* read next token */
};

/* read one token ahead without consuming the current one; returns its code */
const luaX_lookahead = function(ls) {
    assert(ls.lookahead.token === R.TK_EOS);
    ls.lookahead.token = llex(ls, ls.lookahead.seminfo);
    return ls.lookahead.token;
};

module.exports.luaX_lookahead = luaX_lookahead;
module.exports.luaX_next      = luaX_next;
module.exports.luaX_setinput  = luaX_setinput;
\ No newline at end of file |