From 1a158cc176055a50df492dfa39c1b93e9e5b478a Mon Sep 17 00:00:00 2001 From: Benoit Giannangeli Date: Wed, 15 Mar 2017 08:42:55 +0100 Subject: utf8.len, utf8.codes --- src/lutf8lib.js | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 2 deletions(-) (limited to 'src/lutf8lib.js') diff --git a/src/lutf8lib.js b/src/lutf8lib.js index 6ebbec8..a1ce6bb 100644 --- a/src/lutf8lib.js +++ b/src/lutf8lib.js @@ -27,6 +27,7 @@ const utf8_decode = function(s, val) { let limits = [0xFF, 0x7F, 0x7FF, 0xFFFF]; let c = s[0]; let res = 0; /* final result */ + let pos = 0; if (c < 0x80) /* ascii? */ res = c; else { @@ -42,14 +43,61 @@ const utf8_decode = function(s, val) { if (count > 3 || res > MAXUNICODE || res <= limits[count]) return null; /* invalid byte sequence */ s = s.slice(count); /* skip continuation bytes read */ + pos += count; } return { string: s.slice(1), /* +1 to include first byte */ - code: res + code: res, + pos: pos + 1 }; }; +/* +** utf8len(s [, i [, j]]) --> number of characters that start in the +** range [i,j], or nil + current position if 's' is not well formed in +** that interval +*/ +// const utflen = function(L) { +// let n = 0; +// let s = lauxlib.luaL_checkstring(L, 1); +// s = L.stack[lapi.index2addr_(L, 1)].value; +// let len = s.length; +// let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), len); +// let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), len); +// +// lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= len, 2, "initial position out of string"); +// lauxlib.luaL_argcheck(L, --posj < len, 3, "final position out of string"); +// +// while (posi <= posj) { +// let dec = utf8_decode(s[posi]); +// let s1 = dec ? dec.string : null; +// if (s1 === null) { +// /* conversion error? */ +// lapi.lua_pushnil(L); /* return nil ... */ +// lapi.lua_pushinteger(L, posi + 1); /* ... and current position */ +// return 2; +// } +// posi = dec.pos; +// n++; +// } +// lapi.lua_pushinteger(L, n); +// return 1; +// }; + +// Shorter JSesque solution but doesn't take invalid UTF-8 sequence (but how can we get one ?) +const utflen = function(L) { + let s = lauxlib.luaL_checkstring(L, 1); + let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), s.length); + let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), s.length); + + lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= s.length, 2, "initial position out of string"); + lauxlib.luaL_argcheck(L, --posj < s.length, 3, "final position out of string"); + + lapi.lua_pushinteger(L, s.slice(posi, posj + 1).length); + return 1; +}; + const pushutfchar = function(L, arg) { let code = lauxlib.luaL_checkinteger(L, arg); lauxlib.luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range"); @@ -152,10 +200,47 @@ const codepoint = function(L) { return n; }; +const iter_aux = function(L) { + let s = lauxlib.luaL_checkstring(L, 1); + s = L.stack[lapi.index2addr_(L, 1)].value; + let len = s.length; + let n = lapi.lua_tointeger(L, 2) - 1; + + if (n < 0) /* first iteration? */ + n = 0; /* start from here */ + else if (n < len) { + n++; /* skip current byte */ + while (iscont(s[n])) n++; /* and its continuations */ + } + + if (n >= len) + return 0; /* no more codepoints */ + else { + let dec = utf8_decode(s.slice(n)); + let code = dec ? dec.code : null; + let next = dec ? dec.string : null; + if (next === null || iscont(next[0])) + return lauxlib.luaL_error(L, "invalid UTF-8 code"); + lapi.lua_pushinteger(L, n + 1); + lapi.lua_pushinteger(L, code); + return 2; + } +}; + +const iter_codes = function(L) { + lauxlib.luaL_checkstring(L, 1); + lapi.lua_pushcfunction(L, iter_aux); + lapi.lua_pushvalue(L, 1); + lapi.lua_pushinteger(L, 0); + return 3; +}; + const funcs = { "char": utfchar, "codepoint": codepoint, - "offset": byteoffset, + "codes": iter_codes, + "len": utflen, + "offset": byteoffset }; /* pattern to match a single UTF-8 character */ -- cgit v1.2.3-54-g00ecf