From 1a158cc176055a50df492dfa39c1b93e9e5b478a Mon Sep 17 00:00:00 2001 From: Benoit Giannangeli Date: Wed, 15 Mar 2017 08:42:55 +0100 Subject: utf8.len, utf8.codes --- README.md | 8 +---- src/lutf8lib.js | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- tests/lutf8lib.js | 69 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 573288e..05daa08 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,7 @@ - [x] Coroutine - [x] Table - [x] Math + - [x] utf8 - [ ] String - [x] string.char - [x] string.len @@ -233,13 +234,6 @@ - [ ] os - [ ] io - [ ] Debug - - [ ] utf8 - - [x] utf8.char - - [x] utf8.codepoint - - [x] utf8.offset - - [ ] utf8.charpattern - - [ ] utf8.codes - - [ ] utf8.len - [ ] Run [Lua test suite](https://github.com/lua/tests) - [ ] DOM API binding diff --git a/src/lutf8lib.js b/src/lutf8lib.js index 6ebbec8..a1ce6bb 100644 --- a/src/lutf8lib.js +++ b/src/lutf8lib.js @@ -27,6 +27,7 @@ const utf8_decode = function(s, val) { let limits = [0xFF, 0x7F, 0x7FF, 0xFFFF]; let c = s[0]; let res = 0; /* final result */ + let pos = 0; if (c < 0x80) /* ascii? */ res = c; else { @@ -42,14 +43,61 @@ const utf8_decode = function(s, val) { if (count > 3 || res > MAXUNICODE || res <= limits[count]) return null; /* invalid byte sequence */ s = s.slice(count); /* skip continuation bytes read */ + pos += count; } return { string: s.slice(1), /* +1 to include first byte */ - code: res + code: res, + pos: pos + 1 }; }; +/* +** utf8len(s [, i [, j]]) --> number of characters that start in the +** range [i,j], or nil + current position if 's' is not well formed in +** that interval +*/ +// const utflen = function(L) { +// let n = 0; +// let s = lauxlib.luaL_checkstring(L, 1); +// s = L.stack[lapi.index2addr_(L, 1)].value; +// let len = s.length; +// let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), len); +// let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), len); +// +// lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= len, 2, "initial position out of string"); +// lauxlib.luaL_argcheck(L, --posj < len, 3, "final position out of string"); +// +// while (posi <= posj) { +// let dec = utf8_decode(s[posi]); +// let s1 = dec ? dec.string : null; +// if (s1 === null) { +// /* conversion error? */ +// lapi.lua_pushnil(L); /* return nil ... */ +// lapi.lua_pushinteger(L, posi + 1); /* ... and current position */ +// return 2; +// } +// posi = dec.pos; +// n++; +// } +// lapi.lua_pushinteger(L, n); +// return 1; +// }; + +// Shorter JSesque solution but doesn't take invalid UTF-8 sequence (but how can we get one ?) +const utflen = function(L) { + let s = lauxlib.luaL_checkstring(L, 1); + let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), s.length); + let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), s.length); + + lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= s.length, 2, "initial position out of string"); + lauxlib.luaL_argcheck(L, --posj < s.length, 3, "final position out of string"); + + lapi.lua_pushinteger(L, s.slice(posi, posj + 1).length); + return 1; +}; + const pushutfchar = function(L, arg) { let code = lauxlib.luaL_checkinteger(L, arg); lauxlib.luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range"); @@ -152,10 +200,47 @@ const codepoint = function(L) { return n; }; +const iter_aux = function(L) { + let s = lauxlib.luaL_checkstring(L, 1); + s = L.stack[lapi.index2addr_(L, 1)].value; + let len = s.length; + let n = lapi.lua_tointeger(L, 2) - 1; + + if (n < 0) /* first iteration? */ + n = 0; /* start from here */ + else if (n < len) { + n++; /* skip current byte */ + while (iscont(s[n])) n++; /* and its continuations */ + } + + if (n >= len) + return 0; /* no more codepoints */ + else { + let dec = utf8_decode(s.slice(n)); + let code = dec ? dec.code : null; + let next = dec ? dec.string : null; + if (next === null || iscont(next[0])) + return lauxlib.luaL_error(L, "invalid UTF-8 code"); + lapi.lua_pushinteger(L, n + 1); + lapi.lua_pushinteger(L, code); + return 2; + } +}; + +const iter_codes = function(L) { + lauxlib.luaL_checkstring(L, 1); + lapi.lua_pushcfunction(L, iter_aux); + lapi.lua_pushvalue(L, 1); + lapi.lua_pushinteger(L, 0); + return 3; +}; + const funcs = { "char": utfchar, "codepoint": codepoint, - "offset": byteoffset, + "codes": iter_codes, + "len": utflen, + "offset": byteoffset }; /* pattern to match a single UTF-8 character */ diff --git a/tests/lutf8lib.js b/tests/lutf8lib.js index 7033a57..38d8f8a 100644 --- a/tests/lutf8lib.js +++ b/tests/lutf8lib.js @@ -112,4 +112,73 @@ test('utf8.char', function (t) { "Correct element(s) on the stack" ); +}); + + +test('utf8.len', function (t) { + let luaCode = ` + return utf8.len("( ͡° ͜ʖ ͡° )") + `, L; + + t.plan(3); + + t.doesNotThrow(function () { + + L = lauxlib.luaL_newstate(); + + linit.luaL_openlibs(L); + + lauxlib.luaL_loadstring(L, luaCode); + + }, "Lua program loaded without error"); + + t.doesNotThrow(function () { + + lapi.lua_call(L, 0, -1); + + }, "Lua program ran without error"); + + t.strictEqual( + lapi.lua_tointeger(L, -1), + 12, + "Correct element(s) on the stack" + ); + +}); + + +test('utf8.codes', function (t) { + let luaCode = ` + local s = "( ͡° ͜ʖ ͡° )" + local results = "" + for p, c in utf8.codes(s) do + results = results .. "[" .. p .. "," .. c .. "] " + end + return results + `, L; + + t.plan(3); + + t.doesNotThrow(function () { + + L = lauxlib.luaL_newstate(); + + linit.luaL_openlibs(L); + + lauxlib.luaL_loadstring(L, luaCode); + + }, "Lua program loaded without error"); + + t.doesNotThrow(function () { + + lapi.lua_call(L, 0, -1); + + }, "Lua program ran without error"); + + t.strictEqual( + lapi.lua_tostring(L, -1), + "[1,40] [2,32] [3,865] [5,176] [7,32] [8,860] [10,662] [12,32] [13,865] [15,176] [17,32] [18,41] ", + "Correct element(s) on the stack" + ); + }); \ No newline at end of file -- cgit v1.2.3-54-g00ecf