summaryrefslogtreecommitdiff
path: root/src/lstrlib.js
diff options
context:
space:
mode:
authorBenoit Giannangeli <benoit.giannangeli@boursorama.fr>2017-03-20 08:25:04 +0100
committerBenoit Giannangeli <benoit.giannangeli@boursorama.fr>2017-03-20 12:27:40 +0100
commit16612bb93644e83afde482b75725fe1ce902d02a (patch)
tree8610c2ed703994c00844fff83cdea37140e2ae9d /src/lstrlib.js
parent6d684333cbbf92ce940b0f10b126197bb2e04b93 (diff)
downloadfengari-16612bb93644e83afde482b75725fe1ce902d02a.tar.gz
fengari-16612bb93644e83afde482b75725fe1ce902d02a.tar.bz2
fengari-16612bb93644e83afde482b75725fe1ce902d02a.zip
string.match, string.find
Diffstat (limited to 'src/lstrlib.js')
-rw-r--r--src/lstrlib.js400
1 files changed, 399 insertions, 1 deletions
diff --git a/src/lstrlib.js b/src/lstrlib.js
index 56fcc11..64870d6 100644
--- a/src/lstrlib.js
+++ b/src/lstrlib.js
@@ -15,6 +15,12 @@ const L_ESC = '%'.charCodeAt(0);
// (sizeof(size_t) < sizeof(int) ? MAX_SIZET : (size_t)(INT_MAX))
const MAXSIZE = Number.MAX_SAFE_INTEGER;
+/*
+** maximum number of captures that a pattern can do during
+** pattern-matching. This limit is arbitrary, but must fit in
+** an unsigned char.
+*/
+const LUA_MAXCAPTURES = 32;
/* translate a relative string position: negative means back from end */
const posrelat = function(pos, len) {
@@ -165,9 +171,17 @@ const FLAGS = ["-", "+", " ", "#", "0"].map(e => e.charCodeAt(0));
*/
const MAX_FORMAT = 32;
+// TODO: locale ?
+const isalpha = e => /^[a-zA-Z]$/.test(e.charAt(0));
const isdigit = e => "0".charCodeAt(0) <= e && e <= "9".charCodeAt(0);
-
const iscntrl = e => (0x00 <= e && e <= 0x1f) || e === 0x7f;
+const isgraph = e => e.charCodeAt(0) > 32 && e.charCodeAt < 127; // TODO: Will only work for ASCII
+const islower = e => /^(?![A-Z]).*$/.test(e.charAt(0));
+const isupper = e => /^(?![a-z]).*$/.test(e.charAt(0));
+const isalnum = e => /^[a-zA-Z0-9]$/.test(e.charAt(0));
+const ispunct = e => isgraph(e) && !isalnum(e);
+const isspace = e => /^\s$/.test(e.charAt(0));
+const isxdigit = e => /^[0-9A-Fa-f]$/.test(e.charAt(0));
const addquoted = function(b, s) {
lauxlib.luaL_addchar(b, '"');
@@ -807,13 +821,397 @@ const str_unpack = function(L) {
return n + 1;
};
+const CAP_UNFINISHED = -1;
+const CAP_POSITION = -2;
+const MAXCCALLS = 200;
+const SPECIALS = ["^", "$", "*", "+", "?", ".", "(", "[", "%", "-"];
+
+class MatchState {
+ constructor(L) {
+ this.src = null; /* unmodified source string */
+ this.src_init = null; /* init of source string */
+ this.src_end = null; /* end ('\0') of source string */
+ this.p = null; /* unmodified pattern string */
+ this.p_end = null; /* end ('\0') of pattern */
+ this.L = L;
+ this.matchdepth = NaN; /* control for recursive depth */
+ this.level = NaN; /* total number of captures (finished or unfinished) */
+ this.captures = [];
+ }
+}
+
+const classend = function(ms, p) {
+ switch(ms.p.charAt(p++)) {
+ case L_ESC: {
+ if (p === ms.p_end)
+ lauxlib.luaL_error(ms.L, "malformed pattern (ends with '%')");
+ return p + 1;
+ }
+ case '[': {
+ if (ms.p.charAt(p) === '^') p++;
+ do { /* look for a ']' */
+ if (p === ms.p_end)
+ lauxlib.luaL_error(ms.L, "malformed pattern (missing ']')");
+ if (ms.p.charAt(p++) === L_ESC && p < ms.p_end)
+ p++; /* skip escapes (e.g. '%]') */
+ } while (ms.p.charAt(p) !== ']');
+ return p + 1;
+ }
+ default: {
+ return p;
+ }
+ }
+};
+
+const match_class = function(ms, c, cl) {
+ let res;
+ switch (cl.toLowerCase()) {
+ case 'a' : res = isalpha(c); break;
+ case 'c' : res = iscntrl(c); break;
+ case 'd' : res = isdigit(c); break;
+ case 'g' : res = isgraph(c); break;
+ case 'l' : res = islower(c); break;
+ case 'p' : res = ispunct(c); break;
+ case 's' : res = isspace(c); break;
+ case 'u' : res = isupper(c); break;
+ case 'w' : res = isalnum(c); break;
+ case 'x' : res = isxdigit(c); break;
+ case 'z' : res = (c.charCodeAt(0) === 0); break; /* deprecated option */
+ default: return (cl === c);
+ }
+ return (islower(cl) ? res : !res);
+};
+
+const matchbracketclass = function(ms, c, p, ec) {
+ let sig = true;
+ if (ms.p.charAt(p + 1) === '^') {
+ sig = false;
+ p++; /* skip the '^' */
+ }
+ while (++p < ec) {
+ if (ms.p.charAt(p) === L_ESC) {
+ p++;
+ if (match_class(c, ms.p.charAt(p)))
+ return sig;
+ } else if (ms.p.charAt(p + 1) === '-' && p + 2 < ec) {
+ p += 2;
+ if (ms.p.charAt(p - 2) <= c.charCodeAt(0) && c.charCodeAt(0) <= ms.p.charAt(p))
+ return sig;
+ } else if (ms.p.charAt(p) === c) return sig;
+ }
+ return !sig;
+};
+
+const singlematch = function(ms, s, p, ep) {
+ if (s >= ms.src_end)
+ return false;
+ else {
+ let c = ms.src.charAt(s);
+ switch (ms.p.charAt(p)) {
+ case '.': return true; /* matches any char */
+ case L_ESC: return match_class(c, ms.p.charAt(p + 1));
+ case '[': return matchbracketclass(ms, c, p, ep - 1);
+ default: return ms.p.charAt(p) === c;
+ }
+ }
+};
+
+const matchbalance = function(ms, s, p) {
+ if (p >= ms.p_end - 1)
+ lauxlib.luaL_error(ms.L, "malformed pattern (missing arguments to '%b'");
+ if (ms.src.charAt(s) !== ms.p.charAt(p))
+ return null;
+ else {
+ let b = ms.p.charAt(p);
+ let e = ms.p.charAt(p + 1);
+ let cont = 1;
+ while (++s < ms.src_end) {
+ if (ms.src.charAt(s) === e) {
+ if (--cont === 0) return s + 1;
+ }
+ else if (s === b) cont++;
+ }
+ }
+ return null; /* string ends out of balance */
+};
+
+const max_expand = function(ms, s, p, ep) {
+ let i = 0; /* counts maximum expand for item */
+ while (singlematch(ms, s + i, p, ep))
+ i++;
+ /* keeps trying to match with the maximum repetitions */
+ while (i >= 0) {
+ let res = match(ms, s + i, ep + 1);
+ if (res) return res;
+ i--; /* else didn't match; reduce 1 repetition to try again */
+ }
+ return null;
+};
+
+const min_expand = function(ms, s, p, ep) {
+ for (;;) {
+ let res = match(ms, s, ep + 1);
+ if (res !== null)
+ return res;
+ else if (singlematch(ms, s, p, ep))
+ s++; /* try with one more repetition */
+ else return null;
+ }
+};
+
+const start_capture = function(ms, s, p, what) {
+ let level = ms.level;
+ if (level >= LUA_MAXCAPTURES) lauxlib.luaL_error(ms.L, "too many captures");
+ ms.capture[level].init = s;
+ ms.capture[level].len = what;
+ ms.level = level + 1;
+ let res;
+ if ((res = match(ms, s, p)) === null) /* match failed? */
+ ms.level--; /* undo capture */
+ return res;
+};
+
+const end_capture = function(ms, s, p) {
+ let l = capture_to_close(ms);
+ ms.capture[l].len = s - ms.capture[l].init; /* close capture */
+ let res;
+ if ((res = match(ms, s, p)) === null) /* match failed? */
+ ms.capture[l].len = CAP_UNFINISHED; /* undo capture */
+ return res;
+};
+
+const match_capture = function(ms, s, l) {
+ l = check_capture(ms, l);
+ let len = ms.capture[l].len;
+ if (ms.src_end >= len && ms.src.slice(ms.capture[l].init, ms.capture[l].init + len) === ms.src.slice(s, s + len))
+ return s+len;
+ else return null;
+};
+
+const match = function(ms, s, p) {
+ let gotodefault = false;
+ let gotoinit = true;
+
+ if (ms.matchdepth-- === 0)
+ lauxlib.luaL_error(ms.L, "pattern too complex");
+
+ while (gotoinit || gotodefault) {
+ gotoinit = false;
+ if (p !== ms.p_end) { /* end of pattern? */
+ switch (gotodefault ? 'x' : ms.p.charAt(p)) {
+ case '(': { /* start capture */
+ if (ms.p.charAt(p + 1) === ')') /* position capture? */
+ s = start_capture(ms, s, 2, CAP_POSITION);
+ else
+ s = start_capture(ms, s, 1, CAP_UNFINISHED);
+ break;
+ }
+ case ')': { /* end capture */
+ s = end_capture(ms, s, 1);
+ break;
+ }
+ case '$': {
+ if (p + 1 !== ms.p_end) { /* is the '$' the last char in pattern? */
+ gotodefault = true; /* no; go to default */
+ break;
+ }
+ s = ms.src.slice(s).length === 0 ? s : null; /* check end of string */
+ break;
+ }
+ case L_ESC: { /* escaped sequences not in the format class[*+?-]? */
+ switch (ms.p.charAt(p + 1)) {
+ case 'b': { /* balanced string? */
+ s = matchbalance(ms, s, p + 2);
+ if (s !== null) {
+ p = p.slice(4);
+ gotoinit = true;
+ }
+ break;
+ }
+ case 'f': {
+ p += 2;
+ if (ms.p.charAt(p) !== '[')
+ lauxlib.luaL_error(ms.L, `missing '[' after '%f' in pattern`);
+ let ep = classend(ms, p); /* points to what is next */
+ let previous = s === ms.src_init ? '\0' : ms.s.charAt(s - 1);
+ if (!matchbracketclass(ms, previous, p, ep - 1) && matchbracketclass(ms, ms.src.charAt(s), p, ep - 1)) {
+ p = ep; gotoinit = true; break;
+ }
+ s = null; /* match failed */
+ break;
+ }
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ case '8': case '9': { /* capture results (%0-%9)? */
+ s = match_capture(ms, s, ms.p.charAt(p + 1));
+ if (s !== null) {
+ p += 2; gotoinit = true;
+ }
+ break;
+ }
+ default: gotodefault = true;
+ }
+ break;
+ }
+ default: { /* pattern class plus optional suffix */
+ gotodefault = false;
+ let ep = classend(ms, p); /* points to optional suffix */
+ /* does not match at least once? */
+ if (!singlematch(ms, s, p, ep)) {
+ if (ep.charAt(0) === '*' || ep.charAt(0) === '?' || ep.charAt(0) === '-') { /* accept empty? */
+ p = ep + 1; gotoinit = true; break;
+ } else /* '+' or no suffix */
+ s = null; /* fail */
+ } else { /* matched once */
+ switch (ms.p.charAt(ep)) { /* handle optional suffix */
+ case '?': { /* optional */
+ let res;
+ if ((res = match(ms, s + 1, ep + 1)) !== null)
+ s = res;
+ else {
+ p = ep + 1; gotoinit = true;
+ }
+ break;
+ }
+ case '+': /* 1 or more repetitions */
+ s++; /* 1 match already done */
+ case '*': /* 0 or more repetitions */
+ s = max_expand(ms, s, p, ep);
+ break;
+ case '-': /* 0 or more repetitions (minimum) */
+ s = min_expand(ms, s, p, ep);
+ break;
+ default: /* no suffix */
+ s++; p = ep; gotoinit = true;
+ }
+ }
+ break;
+ }
+ }
+ }
+ }
+ ms.matchdepth++;
+ return s;
+};
+
+const push_onecatpure = function(ms, i, s, e) {
+ if (i >= ms.level) {
+ if (i === 0)
+ lapi.lua_pushlstring(ms.L, s, e); /* add whole match */
+ else
+ lauxlib.luaL_error(ms.L, `invalid capture index %${i + 1}`);
+ } else {
+ let l = ms.capture[i].len;
+ if (l === CAP_UNFINISHED) lauxlib.luaL_error(ms.L, "unfinished capture");
+ if (l === CAP_POSITION)
+ lapi.lua_pushinteger(ms.L, ms.src_init + 1);
+ else
+ lapi.lua_pushlstring(ms.L, ms.capture[i].init, l);
+ }
+};
+
+const push_captures = function(ms, s, e) {
+ let nlevels = ms.level === 0 && s ? 1 : ms.level;
+ lauxlib.luaL_checkstack(ms.L, nlevels, "too many catpures");
+ for (let i = 0; i < nlevels; i++)
+ push_onecatpure(ms, i, s, e);
+ return nlevels; /* number of strings pushed */
+};
+
+const nospecials = function(p, l) {
+ let upto = 0;
+ do {
+ let special = false;
+ let supto = p.slice(upto);
+ for (let i = 0; i < SPECIALS.length; i++) {
+ if (supto.indexOf(SPECIALS[i]) > -1) {
+ special = true;
+ break;
+ }
+ }
+
+ if (special)
+ return false; /* pattern has a special character */
+ upto = upto + 1; /* may have more after \0 */
+ } while (upto <= l);
+ return true; /* no special chars found */
+};
+
+const prepstate = function(ms, L, s, ls, p, lp) {
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src = s;
+ ms.src_init = 0;
+ ms.src_end = ls;
+ ms.p = p;
+ ms.p_end = lp;
+};
+
+const reprepstate = function(ms) {
+ ms.level = 0;
+ assert(ms.matchdepth === MAXCCALLS);
+};
+
+const str_find_aux = function(L, find) {
+ let s = lauxlib.luaL_checkstring(L, 1);
+ let p = lauxlib.luaL_checkstring(L, 2);
+ let ls = s.length;
+ let lp = p.length;
+ let init = posrelat(lauxlib.luaL_optinteger(L, 3, 1), ls);
+ if (init < 1) init = 1;
+ else if (init > ls + 1) { /* start after string's end? */
+ lauxlib.lua_pushnil(L); /* cannot find anything */
+ return 1;
+ }
+ /* explicit request or no special characters? */
+ if (find && (lapi.lua_toboolean(L, 4) || nospecials(p, lp))) {
+ /* do a plain search */
+ let f = s.indexOf(p);
+ if (f > -1) {
+ lapi.lua_pushinteger(L, f);
+ lapi.lua_pushinteger(L, f + lp);
+ return 2;
+ }
+ } else {
+ let ms = new MatchState(L);
+ let s1 = init - 1;
+ let anchor = s.charAt(0) === '^';
+ if (anchor) {
+ p = p.slice(1); lp--; /* skip anchor character */
+ }
+ prepstate(ms, L, s, ls, p, lp);
+ do {
+ let res;
+ reprepstate(ms);
+ if ((res = match(ms, s1, 0)) !== null) {
+ if (find) {
+ lapi.lua_pushinteger(L, s1); /* start */
+ lapi.lua_pushinteger(L, res); /* end */
+ return push_captures(ms, null, 0) + 2;
+ } else
+ return push_captures(ms, s1, res);
+ }
+ } while (s1++ < ms.src_end && !anchor);
+ }
+};
+
+const str_find = function(L) {
+ return str_find_aux(L, 1);
+};
+
+const str_match = function(L) {
+ return str_find_aux(L, 1);
+};
+
const strlib = {
"byte": str_byte,
"char": str_char,
"dump": str_dump,
+ "find": str_find,
"format": str_format,
"len": str_len,
"lower": str_lower,
+ "match": str_match,
"pack": str_pack,
"packsize": str_packsize,
"rep": str_rep,