summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/lutf8lib.js66
-rw-r--r--tests/lutf8lib.js44
2 files changed, 109 insertions, 1 deletions
diff --git a/src/lutf8lib.js b/src/lutf8lib.js
index db5f7ad..f2c7f6b 100644
--- a/src/lutf8lib.js
+++ b/src/lutf8lib.js
@@ -6,6 +6,7 @@ const lua = require('./lua.js');
const lapi = require('./lapi.js');
const lauxlib = require('./lauxlib.js');
+const MAXUNICODE = 0x10FFFF;
const iscont = function(p) {
let c = p & 0xC0;
@@ -20,6 +21,36 @@ const u_posrelat = function(pos, len) {
};
/*
+** Decode the single UTF-8 sequence that begins at s[0].
+**
+** Returns null when the bytes do not form a valid sequence. Otherwise
+** returns an object { string, code }: 'code' is the decoded code point
+** and 'string' is the part of 's' that follows the bytes just consumed.
+** The 'val' parameter is accepted but never read.
+*/
+const utf8_decode = function(s, val) {
+    const lead = s[0];
+
+    /* One-byte (ASCII) sequence: the byte is the code point. */
+    if (lead < 0x80) {
+        return {
+            string: s.slice(1),
+            code: lead
+        };
+    }
+
+    /* A value decoded from k continuation bytes must be greater than floors[k]. */
+    const floors = [0xFF, 0x7F, 0x7FF, 0xFFFF];
+
+    let extra = 0;     /* continuation bytes read so far */
+    let code = 0;      /* code point being assembled */
+    let marker = lead; /* lead byte, shifted left once per continuation byte */
+
+    /* Bit 0x40 of the (shifted) lead byte says one more continuation byte follows. */
+    for (; marker & 0x40; marker <<= 1) {
+        extra += 1;
+        const next = s[extra];
+        /* Every continuation byte must look like 10xxxxxx. */
+        if ((next & 0xC0) !== 0x80) {
+            return null;
+        }
+        code = (code << 6) | (next & 0x3F); /* append its low 6 bits */
+    }
+
+    /* Merge in the payload bits that remain from the lead byte. */
+    code |= (marker & 0x7F) << (extra * 5);
+
+    const valid = extra <= 3 && code <= MAXUNICODE && code > floors[extra];
+    if (!valid) {
+        return null;
+    }
+
+    return {
+        string: s.slice(extra + 1), /* drop the lead byte and its continuation bytes */
+        code: code
+    };
+};
+
+/*
** offset(s, n, [i]) -> index where n-th character counting from
** position 'i' starts; 0 means character at 'i'.
*/
@@ -29,6 +60,7 @@ const byteoffset = function(L) {
let n = lauxlib.luaL_checkinteger(L, 2);
let posi = n >= 0 ? 1 : s.length + 1;
posi = u_posrelat(lauxlib.luaL_optinteger(L, 3, posi), s.length);
+
lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= s.length, 3, "position ot ouf range");
if (n === 0) {
@@ -64,8 +96,40 @@ const byteoffset = function(L) {
return 1;
};
+/*
+** codepoint(s, [i, [j]]) -> returns codepoints for all characters
+** that start in the range [i,j]
+**
+** 'i' defaults to 1 and 'j' defaults to 'i'; both are byte positions
+** normalised through u_posrelat. One integer is pushed per decoded
+** character and the number of pushed values is returned. An error is
+** raised for positions out of range, for a slice that is too long, and
+** for an invalid UTF-8 sequence.
+*/
+const codepoint = function(L) {
+    /* The return value is not used; the call is presumably kept for its
+    ** argument check. The value itself is read from the stack slot. */
+    lauxlib.luaL_checkstring(L, 1);
+    let s = L.stack[lapi.index2addr_(L, 1)].value;
+    const posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), s.length);
+    const pose = u_posrelat(lauxlib.luaL_optinteger(L, 3, posi), s.length);
+
+    lauxlib.luaL_argcheck(L, posi >= 1, 2, "out of range");
+    lauxlib.luaL_argcheck(L, pose <= s.length, 3, "out of range");
+
+    if (posi > pose) return 0; /* empty interval; return no values */
+    if (pose - posi >= Number.MAX_SAFE_INTEGER)
+        return lauxlib.luaL_error(L, "string slice too long");
+
+    /* Bytes in [posi, pose]; also an upper bound on the number of results. */
+    const span = (pose - posi) + 1;
+    lauxlib.luaL_checkstack(L, span, "string slice too long");
+
+    s = s.slice(posi - 1);
+    const start = s.length; /* bytes available when decoding begins */
+    let n = 0;              /* code points pushed so far */
+
+    /*
+    ** Decode every character that STARTS inside the range: keep going
+    ** while the bytes consumed so far are fewer than the range length.
+    ** (Counting decoded characters instead of bytes stops too early or
+    ** too late whenever the range holds multi-byte characters.)
+    */
+    while (start - s.length < span) {
+        const dec = utf8_decode(s);
+        if (dec === null)
+            return lauxlib.luaL_error(L, "invalid UTF-8 code");
+        lapi.lua_pushinteger(L, dec.code);
+        s = dec.string; /* continue after the sequence just decoded */
+        n++;
+    }
+    return n;
+};
+
const funcs = {
- "offset": byteoffset
+ "codepoint": codepoint,
+ "offset": byteoffset
};
/* pattern to match a single UTF-8 character */
diff --git a/tests/lutf8lib.js b/tests/lutf8lib.js
index 3006bd4..edfae56 100644
--- a/tests/lutf8lib.js
+++ b/tests/lutf8lib.js
@@ -36,4 +36,48 @@ test('utf8.offset', function (t) {
"Correct element(s) on the stack"
);
+});
+
+
+test('utf8.codepoint', function (t) {
+    /* Lua chunk under test: code points of the characters starting in bytes 5..8. */
+    const luaCode = `
+ return utf8.codepoint("( ͡° ͜ʖ ͡° )", 5, 8)
+ `;
+    /* Values expected on the stack, listed bottom to top. */
+    const expected = [176, 32, 860];
+    let L;
+
+    t.plan(5);
+
+    t.doesNotThrow(() => {
+        L = lauxlib.luaL_newstate();
+        linit.luaL_openlibs(L);
+        lauxlib.luaL_loadstring(L, luaCode);
+    }, "Lua program loaded without error");
+
+    t.doesNotThrow(() => {
+        lapi.lua_call(L, 0, -1);
+    }, "Lua program ran without error");
+
+    /* Entry k of 'expected' sits at stack index k - expected.length (-3, -2, -1). */
+    expected.forEach((code, k) => {
+        t.strictEqual(
+            lapi.lua_tointeger(L, k - expected.length),
+            code,
+            "Correct element(s) on the stack"
+        );
+    });
+
}); \ No newline at end of file