utf8.len, utf8.codes

author: Benoit Giannangeli <benoit.giannangeli@boursorama.fr> 2017-03-15 08:42:55 +0100
committer: Benoit Giannangeli <benoit.giannangeli@boursorama.fr> 2017-03-15 08:42:55 +0100
commit: 1a158cc176055a50df492dfa39c1b93e9e5b478a (patch)
tree: 1a6625ab9ed29379158072f0352e61450e3890a4 /src/lutf8lib.js
parent: dbbbb1a029fa3d6ad346426bee3310191e0dd285 (diff)
download: fengari-1a158cc176055a50df492dfa39c1b93e9e5b478a.tar.gz
fengari-1a158cc176055a50df492dfa39c1b93e9e5b478a.tar.bz2
fengari-1a158cc176055a50df492dfa39c1b93e9e5b478a.zip
1 files changed, 87 insertions, 2 deletions
diff --git a/src/lutf8lib.js b/src/lutf8lib.js
index 6ebbec8..a1ce6bb 100644
--- a/src/lutf8lib.js
+++ b/src/lutf8lib.js
@@ -27,6 +27,7 @@ const utf8_decode = function(s, val) {
     let limits = [0xFF, 0x7F, 0x7FF, 0xFFFF];
     let c = s[0];
     let res = 0;  /* final result */
+    let pos = 0;
     if (c < 0x80)  /* ascii? */
         res = c;
     else {
@@ -42,14 +43,61 @@ const utf8_decode = function(s, val) {
         if (count > 3 || res > MAXUNICODE || res <= limits[count])
             return null;  /* invalid byte sequence */
         s = s.slice(count);  /* skip continuation bytes read */
+        pos += count;
     }
 
     return {
         string: s.slice(1),  /* +1 to include first byte */
-        code: res
+        code: res,
+        pos: pos + 1
     };
 };
 
+/*
+** utf8len(s [, i [, j]]) --> number of characters that start in the
+** range [i,j], or nil + current position if 's' is not well formed in
+** that interval
+*/
+// const utflen = function(L) {
+//     let n = 0;
+//     let s = lauxlib.luaL_checkstring(L, 1);
+//     s = L.stack[lapi.index2addr_(L, 1)].value;
+//     let len = s.length;
+//     let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), len);
+//     let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), len);
+//
+//     lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= len, 2, "initial position out of string");
+//     lauxlib.luaL_argcheck(L, --posj < len, 3, "final position out of string");
+//
+//     while (posi <= posj) {
+//         let dec = utf8_decode(s[posi]);
+//         let s1 = dec ? dec.string : null;
+//         if (s1 === null) {
+//             /* conversion error? */
+//             lapi.lua_pushnil(L);  /* return nil ... */
+//             lapi.lua_pushinteger(L, posi + 1);  /* ... and current position */
+//             return 2;
+//         }
+//         posi = dec.pos;
+//         n++;
+//     }
+//     lapi.lua_pushinteger(L, n);
+//     return 1;
+// };
+
+// JS-native utf8.len: pushes the code-point count of s[i..j]; malformed input is never reported (no nil + position).
+const utflen = function(L) {
+    const s = lauxlib.luaL_checkstring(L, 1);
+    let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), s.length);  /* i: argument 2, default 1 */
+    let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), s.length);  /* j: argument 3, default -1 */
+
+    lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= s.length, 2, "initial position out of string");
+    lauxlib.luaL_argcheck(L, --posj < s.length, 3, "final position out of string");
+
+    lapi.lua_pushinteger(L, Array.from(s.slice(posi, posj + 1)).length);  /* code points: .length would count an astral character twice */
+    return 1;  /* one result pushed */
+};
+
 const pushutfchar = function(L, arg) {
     let code = lauxlib.luaL_checkinteger(L, arg);
     lauxlib.luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
@@ -152,10 +200,47 @@ const codepoint = function(L) {
     return n;
 };
 
+const iter_aux = function(L) {
+    lauxlib.luaL_checkstring(L, 1);  /* validate argument 1; the result itself is not used */
+    const raw = L.stack[lapi.index2addr_(L, 1)].value;  /* value stored in stack slot 1 */
+    const size = raw.length;
+    let at = lapi.lua_tointeger(L, 2) - 1;  /* argument 2, shifted down by one */
+
+    if (at < 0) {  /* nothing visited yet? */
+        at = 0;  /* begin at the first element */
+    } else if (at < size) {
+        at++;  /* step over the element visited last time */
+        while (iscont(raw[at])) at++;  /* ... and whatever iscont() accepts after it */
+    }
+
+    if (at >= size)
+        return 0;  /* ran past the end: no results, iteration is over */
+
+    const decoded = utf8_decode(raw.slice(at));
+    const rest = decoded ? decoded.string : null;  /* remainder after the decoded sequence */
+    if (rest === null || iscont(rest[0]))
+        return lauxlib.luaL_error(L, "invalid UTF-8 code");
+
+    /* two results: the 1-based position, then the decoded code */
+    lapi.lua_pushinteger(L, at + 1);
+    lapi.lua_pushinteger(L, decoded.code);
+    return 2;
+};
+
+const iter_codes = function(L) {  /* pushes three values: iter_aux, the string argument, and 0 */
+    lauxlib.luaL_checkstring(L, 1);  /* validate argument 1 */
+    lapi.lua_pushcfunction(L, iter_aux);  /* the stepping function */
+    lapi.lua_pushvalue(L, 1);  /* copy of argument 1; presumably handed back to iter_aux as its argument 1 */
+    lapi.lua_pushinteger(L, 0);  /* start value: iter_aux treats (value - 1) < 0 as the first iteration */
+    return 3;  /* number of values pushed */
+};
+
 const funcs = {
     "char":      utfchar,
     "codepoint": codepoint,
-    "offset":    byteoffset,
+    "codes":     iter_codes,  /* added by this patch */
+    "len":       utflen,  /* added by this patch */
+    "offset":    byteoffset
 };
 
 /* pattern to match a single UTF-8 character */
author	Benoit Giannangeli <benoit.giannangeli@boursorama.fr>	2017-03-15 08:42:55 +0100
committer	Benoit Giannangeli <benoit.giannangeli@boursorama.fr>	2017-03-15 08:42:55 +0100
commit	1a158cc176055a50df492dfa39c1b93e9e5b478a (patch)
tree	1a6625ab9ed29379158072f0352e61450e3890a4 /src/lutf8lib.js
parent	dbbbb1a029fa3d6ad346426bee3310191e0dd285 (diff)
download	fengari-1a158cc176055a50df492dfa39c1b93e9e5b478a.tar.gz fengari-1a158cc176055a50df492dfa39c1b93e9e5b478a.tar.bz2 fengari-1a158cc176055a50df492dfa39c1b93e9e5b478a.zip