summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md8
-rw-r--r--src/lutf8lib.js89
-rw-r--r--tests/lutf8lib.js69
3 files changed, 157 insertions, 9 deletions
diff --git a/README.md b/README.md
index 573288e..05daa08 100644
--- a/README.md
+++ b/README.md
@@ -211,6 +211,7 @@
- [x] Coroutine
- [x] Table
- [x] Math
+ - [x] utf8
- [ ] String
- [x] string.char
- [x] string.len
@@ -233,13 +234,6 @@
- [ ] os
- [ ] io
- [ ] Debug
- - [ ] utf8
- - [x] utf8.char
- - [x] utf8.codepoint
- - [x] utf8.offset
- - [ ] utf8.charpattern
- - [ ] utf8.codes
- - [ ] utf8.len
- [ ] Run [Lua test suite](https://github.com/lua/tests)
- [ ] DOM API binding
diff --git a/src/lutf8lib.js b/src/lutf8lib.js
index 6ebbec8..a1ce6bb 100644
--- a/src/lutf8lib.js
+++ b/src/lutf8lib.js
@@ -27,6 +27,7 @@ const utf8_decode = function(s, val) {
let limits = [0xFF, 0x7F, 0x7FF, 0xFFFF];
let c = s[0];
let res = 0; /* final result */
+ let pos = 0;
if (c < 0x80) /* ascii? */
res = c;
else {
@@ -42,14 +43,61 @@ const utf8_decode = function(s, val) {
if (count > 3 || res > MAXUNICODE || res <= limits[count])
return null; /* invalid byte sequence */
s = s.slice(count); /* skip continuation bytes read */
+ pos += count;
}
return {
string: s.slice(1), /* +1 to include first byte */
- code: res
+ code: res,
+ pos: pos + 1
};
};
+/*
+** utf8len(s [, i [, j]]) --> number of characters that start in the
+** range [i,j], or nil + current position if 's' is not well formed in
+** that interval
+*/
+// const utflen = function(L) {
+// let n = 0;
+// let s = lauxlib.luaL_checkstring(L, 1);
+// s = L.stack[lapi.index2addr_(L, 1)].value;
+// let len = s.length;
+// let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), len);
+// let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), len);
+//
+// lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= len, 2, "initial position out of string");
+// lauxlib.luaL_argcheck(L, --posj < len, 3, "final position out of string");
+//
+// while (posi <= posj) {
+// let dec = utf8_decode(s[posi]);
+// let s1 = dec ? dec.string : null;
+// if (s1 === null) {
+// /* conversion error? */
+// lapi.lua_pushnil(L); /* return nil ... */
+// lapi.lua_pushinteger(L, posi + 1); /* ... and current position */
+// return 2;
+// }
+// posi = dec.pos;
+// n++;
+// }
+// lapi.lua_pushinteger(L, n);
+// return 1;
+// };
+
+// Shorter JS-style implementation; unlike the commented-out version above it does not detect invalid UTF-8 sequences (unclear how one could be produced here)
+const utflen = function(L) { /* utf8.len(s [, i [, j]]): pushes one integer and returns 1 */
+ let s = lauxlib.luaL_checkstring(L, 1);
+ let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), s.length); /* i defaults to 1 */
+ let posj = u_posrelat(lauxlib.luaL_optinteger(L, 3, -1), s.length); /* j defaults to -1 */
+
+ lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= s.length, 2, "initial position out of string"); /* --posi also converts posi to a 0-based index */
+ lauxlib.luaL_argcheck(L, --posj < s.length, 3, "final position out of string"); /* --posj likewise becomes 0-based */
+
+ lapi.lua_pushinteger(L, s.slice(posi, posj + 1).length); /* NOTE(review): this is the length of the slice of `s`, not a count of decoded UTF-8 sequences - confirm what luaL_checkstring returns */
+ return 1;
+};
+
const pushutfchar = function(L, arg) {
let code = lauxlib.luaL_checkinteger(L, arg);
lauxlib.luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
@@ -152,10 +200,47 @@ const codepoint = function(L) {
return n;
};
+const iter_aux = function(L) { /* iteration step for utf8.codes: reads the string at index 1 and an integer at index 2 */
+ let s = lauxlib.luaL_checkstring(L, 1);
+ s = L.stack[lapi.index2addr_(L, 1)].value; /* replace s with the raw value held in the stack slot; NOTE(review): presumably the byte representation - confirm */
+ let len = s.length;
+ let n = lapi.lua_tointeger(L, 2) - 1; /* 0-based; becomes -1 when index 2 holds 0 (the initial value pushed by iter_codes) */
+
+ if (n < 0) /* first iteration? */
+ n = 0; /* start from here */
+ else if (n < len) {
+ n++; /* skip current byte */
+ while (iscont(s[n])) n++; /* and its continuations */
+ }
+
+ if (n >= len)
+ return 0; /* no more codepoints */
+ else {
+ let dec = utf8_decode(s.slice(n)); /* decode the sequence starting at n; utf8_decode returns null for an invalid sequence */
+ let code = dec ? dec.code : null;
+ let next = dec ? dec.string : null; /* what remains of s after the decoded sequence */
+ if (next === null || iscont(next[0])) /* decode failed, or a stray continuation byte follows */
+ return lauxlib.luaL_error(L, "invalid UTF-8 code");
+ lapi.lua_pushinteger(L, n + 1); /* 1-based position of the sequence */
+ lapi.lua_pushinteger(L, code); /* decoded code point */
+ return 2;
+ }
+};
+
+const iter_codes = function(L) { /* utf8.codes(s): returns the three values pushed below */
+ lauxlib.luaL_checkstring(L, 1); /* result discarded; called only to check argument 1 */
+ lapi.lua_pushcfunction(L, iter_aux); /* iteration step function */
+ lapi.lua_pushvalue(L, 1); /* the string argument itself */
+ lapi.lua_pushinteger(L, 0); /* initial value; iter_aux treats it as "first iteration" */
+ return 3;
+};
+
const funcs = {
"char": utfchar,
"codepoint": codepoint,
- "offset": byteoffset,
+ "codes": iter_codes,
+ "len": utflen,
+ "offset": byteoffset
};
/* pattern to match a single UTF-8 character */
diff --git a/tests/lutf8lib.js b/tests/lutf8lib.js
index 7033a57..38d8f8a 100644
--- a/tests/lutf8lib.js
+++ b/tests/lutf8lib.js
@@ -112,4 +112,73 @@ test('utf8.char', function (t) {
"Correct element(s) on the stack"
);
+});
+
+
+test('utf8.len', function (t) { /* runs a Lua chunk calling utf8.len and checks the integer left on the stack */
+ let luaCode = `
+ return utf8.len("( ͡° ͜ʖ ͡° )")
+ `, L;
+
+ t.plan(3); /* two doesNotThrow checks and one strictEqual */
+
+ t.doesNotThrow(function () {
+
+ L = lauxlib.luaL_newstate();
+
+ linit.luaL_openlibs(L);
+
+ lauxlib.luaL_loadstring(L, luaCode);
+
+ }, "Lua program loaded without error");
+
+ t.doesNotThrow(function () {
+
+ lapi.lua_call(L, 0, -1);
+
+ }, "Lua program ran without error");
+
+ t.strictEqual(
+ lapi.lua_tointeger(L, -1), /* value on top of the stack after the call */
+ 12,
+ "Correct element(s) on the stack"
+ );
+
+});
+
+
+test('utf8.codes', function (t) { /* iterates utf8.codes in Lua and checks the collected "[position,code] " pairs */
+ let luaCode = `
+ local s = "( ͡° ͜ʖ ͡° )"
+ local results = ""
+ for p, c in utf8.codes(s) do
+ results = results .. "[" .. p .. "," .. c .. "] "
+ end
+ return results
+ `, L;
+
+ t.plan(3); /* two doesNotThrow checks and one strictEqual */
+
+ t.doesNotThrow(function () {
+
+ L = lauxlib.luaL_newstate();
+
+ linit.luaL_openlibs(L);
+
+ lauxlib.luaL_loadstring(L, luaCode);
+
+ }, "Lua program loaded without error");
+
+ t.doesNotThrow(function () {
+
+ lapi.lua_call(L, 0, -1);
+
+ }, "Lua program ran without error");
+
+ t.strictEqual(
+ lapi.lua_tostring(L, -1), /* string on top of the stack after the call */
+ "[1,40] [2,32] [3,865] [5,176] [7,32] [8,860] [10,662] [12,32] [13,865] [15,176] [17,32] [18,41] ",
+ "Correct element(s) on the stack"
+ );
+
}); \ No newline at end of file