From 1a158cc176055a50df492dfa39c1b93e9e5b478a Mon Sep 17 00:00:00 2001
From: Benoit Giannangeli <benoit.giannangeli@boursorama.fr>
Date: Wed, 15 Mar 2017 08:42:55 +0100
Subject: utf8.len, utf8.codes

---
 README.md         |  8 +----
 src/lutf8lib.js   | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 tests/lutf8lib.js | 69 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 157 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 573288e..05daa08 100644
--- a/README.md
+++ b/README.md
@@ -211,6 +211,7 @@
     - [x] Coroutine
     - [x] Table
     - [x] Math
+    - [x] utf8
     - [ ] String
         - [x] string.char
         - [x] string.len
@@ -233,13 +234,6 @@
     - [ ] os
     - [ ] io
     - [ ] Debug
-    - [ ] utf8
-        - [x] utf8.char
-        - [x] utf8.codepoint
-        - [x] utf8.offset
-        - [ ] utf8.charpattern
-        - [ ] utf8.codes
-        - [ ] utf8.len
 - [ ] Run [Lua test suite](https://github.com/lua/tests)
 - [ ] DOM API binding
 
diff --git a/src/lutf8lib.js b/src/lutf8lib.js
index 6ebbec8..a1ce6bb 100644
--- a/src/lutf8lib.js
+++ b/src/lutf8lib.js
@@ -27,6 +27,7 @@ const utf8_decode = function(s, val) {
     let limits = [0xFF, 0x7F, 0x7FF, 0xFFFF];
     let c = s[0];
     let res = 0;  /* final result */
+    let pos = 0;
     if (c < 0x80)  /* ascii? */
         res = c;
     else {
@@ -42,14 +43,61 @@ const utf8_decode = function(s, val) {
         if (count > 3 || res > MAXUNICODE || res <= limits[count])
             return null;  /* invalid byte sequence */
         s = s.slice(count);  /* skip continuation bytes read */
+        pos += count;
     }
 
     return {
         string: s.slice(1),  /* +1 to include first byte */
-        code: res
+        code: res,
+        pos: pos + 1
     };
 };
 
+/*
+** utf8len(s [, i [, j]]) --> number of characters that start in the
+** range [i,j], or nil + current position if 's' is not well formed in
+** that interval
+**
+** i and j are byte positions, passed through u_posrelat together with the
+** byte length of s; they default to 1 and -1.
+**
+** The characters are counted by decoding the raw bytes of the string.
+** Taking the length of a JS string slice instead would count UTF-16 code
+** units, would treat i and j as indices into the JS string, and could
+** never report a malformed sequence.
+**
+** Returns 1 (the count) or 2 (nil and the 1-based position of the first
+** byte that could not be decoded).
+*/
+const utflen = function(L) {
+    let n = 0;  /* characters counted so far */
+    lauxlib.luaL_checkstring(L, 1);  /* argument 1 is checked to be a string */
+    let s = L.stack[lapi.index2addr_(L, 1)].value;  /* raw bytes, read the same way iter_aux does */
+    let len = s.length;
+    let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), len);
+    let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), len);
+
+    /* the checks also turn both positions into 0-based byte offsets */
+    lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= len, 2, "initial position out of string");
+    lauxlib.luaL_argcheck(L, --posj < len, 3, "final position out of string");
+
+    while (posi <= posj) {
+        let dec = utf8_decode(s.slice(posi));  /* decode the character starting at posi */
+        if (!dec) {
+            /* conversion error? */
+            lapi.lua_pushnil(L);  /* return nil ... */
+            lapi.lua_pushinteger(L, posi + 1);  /* ... and current position */
+            return 2;
+        }
+
+        posi += dec.pos;  /* dec.pos is the number of bytes the character used */
+        n++;
+    }
+
+    lapi.lua_pushinteger(L, n);
+    return 1;
+};
+
 const pushutfchar = function(L, arg) {
     let code = lauxlib.luaL_checkinteger(L, arg);
     lauxlib.luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
@@ -152,10 +200,47 @@ const codepoint = function(L) {
     return n;
 };
 
+/* Iteration step of utf8.codes: arg 1 is the string, arg 2 the previous 1-based byte position (0 at start). */
+const iter_aux = function(L) {
+    lauxlib.luaL_checkstring(L, 1);
+    const bytes = L.stack[lapi.index2addr_(L, 1)].value;
+    const size = bytes.length;
+    let offset = lapi.lua_tointeger(L, 2) - 1;  /* 0-based offset of the previous character */
+
+    if (offset < 0) {
+        offset = 0;  /* nothing visited yet: begin at the first byte */
+    } else if (offset < size) {
+        /* step over the current byte, then over every continuation byte after it */
+        do { offset++; } while (iscont(bytes[offset]));
+    }
+
+    if (offset >= size)
+        return 0;  /* string exhausted: end of the iteration */
+
+    const decoded = utf8_decode(bytes.slice(offset));
+    const rest = decoded ? decoded.string : null;  /* bytes left after the decoded character */
+    if (rest === null || iscont(rest[0]))
+        return lauxlib.luaL_error(L, "invalid UTF-8 code");
+
+    lapi.lua_pushinteger(L, offset + 1);  /* 1-based byte position of this character */
+    lapi.lua_pushinteger(L, decoded.code);
+    return 2;
+};
+
+const iter_codes = function(L) {  /* utf8.codes(s): pushes the three values a generic 'for' loop iterates with */
+    lauxlib.luaL_checkstring(L, 1);  /* argument 1 is checked to be a string */
+    lapi.lua_pushcfunction(L, iter_aux);  /* iterator function */
+    lapi.lua_pushvalue(L, 1);  /* state: the string itself */
+    lapi.lua_pushinteger(L, 0);  /* initial control value; iter_aux treats it as "first iteration" */
+    return 3;
+};
+
 const funcs = {
     "char":      utfchar,
     "codepoint": codepoint,
-    "offset":    byteoffset,
+    "codes":     iter_codes,  /* utf8.codes: iterator over (byte position, code point) pairs */
+    "len":       utflen,  /* utf8.len(s [, i [, j]]) */
+    "offset":    byteoffset
 };
 
 /* pattern to match a single UTF-8 character */
diff --git a/tests/lutf8lib.js b/tests/lutf8lib.js
index 7033a57..38d8f8a 100644
--- a/tests/lutf8lib.js
+++ b/tests/lutf8lib.js
@@ -112,4 +112,73 @@ test('utf8.char', function (t) {
         "Correct element(s) on the stack"
     );
 
+});
+
+
+test('utf8.len', function (t) {
+    /* Lua chunk whose single return value is the length reported by utf8.len */
+    const luaCode = `
+        return utf8.len("( ͡° ͜ʖ ͡° )")
+    `;
+    let L;  /* Lua state shared by the assertions below */
+
+    t.plan(3);
+
+    /* 1: a fresh state with the libraries opened can load the chunk */
+    t.doesNotThrow(() => {
+        L = lauxlib.luaL_newstate();
+        linit.luaL_openlibs(L);
+        lauxlib.luaL_loadstring(L, luaCode);
+    }, "Lua program loaded without error");
+
+    /* 2: the loaded chunk runs to completion */
+    t.doesNotThrow(() => {
+        lapi.lua_call(L, 0, -1);
+    }, "Lua program ran without error");
+
+    /* 3: the value on top of the stack is the expected character count */
+    const actualLength = lapi.lua_tointeger(L, -1);
+    const expectedLength = 12;
+    t.strictEqual(
+        actualLength,
+        expectedLength,
+        "Correct element(s) on the stack"
+    );
+});
+
+
+test('utf8.codes', function (t) {
+    /* Lua chunk that renders every (position, code point) pair as "[p,c] " */
+    const luaCode = `
+        local s = "( ͡° ͜ʖ ͡° )"
+        local results = ""
+        for p, c in utf8.codes(s) do
+            results = results .. "[" .. p .. "," .. c .. "] "
+        end
+        return results
+    `;
+    let L;  /* Lua state shared by the assertions below */
+
+    t.plan(3);
+
+    /* 1: a fresh state with the libraries opened can load the chunk */
+    t.doesNotThrow(() => {
+        L = lauxlib.luaL_newstate();
+        linit.luaL_openlibs(L);
+        lauxlib.luaL_loadstring(L, luaCode);
+    }, "Lua program loaded without error");
+
+    /* 2: the loaded chunk runs to completion */
+    t.doesNotThrow(() => {
+        lapi.lua_call(L, 0, -1);
+    }, "Lua program ran without error");
+
+    /* 3: positions advance by the byte width of each character */
+    const expectedPairs = "[1,40] [2,32] [3,865] [5,176] [7,32] [8,860] [10,662] [12,32] [13,865] [15,176] [17,32] [18,41] ";
+    t.strictEqual(
+        lapi.lua_tostring(L, -1),
+        expectedPairs,
+        "Correct element(s) on the stack"
+    );
+
+});
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2