diff options
author | Benoit Giannangeli <benoit.giannangeli@boursorama.fr> | 2017-03-15 07:56:37 +0100 |
---|---|---|
committer | Benoit Giannangeli <benoit.giannangeli@boursorama.fr> | 2017-03-15 07:56:37 +0100 |
commit | 2269fd42e2d44096d368718c314213ff7aafc164 (patch) | |
tree | cbeebb540a37e0e81dc1c85cdac56792291e8623 /src/lutf8lib.js | |
parent | dd44bc89e607f7e395233e03758ac8e83a8ccc7a (diff) | |
download | fengari-2269fd42e2d44096d368718c314213ff7aafc164.tar.gz fengari-2269fd42e2d44096d368718c314213ff7aafc164.tar.bz2 fengari-2269fd42e2d44096d368718c314213ff7aafc164.zip |
utf8.codepoint
Diffstat (limited to 'src/lutf8lib.js')
-rw-r--r-- | src/lutf8lib.js | 66 |
1 files changed, 65 insertions, 1 deletions
diff --git a/src/lutf8lib.js b/src/lutf8lib.js index db5f7ad..f2c7f6b 100644 --- a/src/lutf8lib.js +++ b/src/lutf8lib.js @@ -6,6 +6,7 @@ const lua = require('./lua.js'); const lapi = require('./lapi.js'); const lauxlib = require('./lauxlib.js'); +const MAXUNICODE = 0x10FFFF; const iscont = function(p) { let c = p & 0xC0; @@ -20,6 +21,36 @@ const u_posrelat = function(pos, len) { }; /* +** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. +*/ +const utf8_decode = function(s, val) { + let limits = [0xFF, 0x7F, 0x7FF, 0xFFFF]; + let c = s[0]; + let res = 0; /* final result */ + if (c < 0x80) /* ascii? */ + res = c; + else { + let count = 0; /* to count number of continuation bytes */ + while (c & 0x40) { /* still have continuation bytes? */ + let cc = s[++count]; /* read next byte */ + if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ + return null; /* invalid byte sequence */ + res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ + c <<= 1; /* to test next bit */ + } + res |= ((c & 0x7F) << (count * 5)); /* add first byte */ + if (count > 3 || res > MAXUNICODE || res <= limits[count]) + return null; /* invalid byte sequence */ + s = s.slice(count); /* skip continuation bytes read */ + } + + return { + string: s.slice(1), /* +1 to include first byte */ + code: res + }; +}; + +/* ** offset(s, n, [i]) -> index where n-th character counting from ** position 'i' starts; 0 means character at 'i'. */ @@ -29,6 +60,7 @@ const byteoffset = function(L) { let n = lauxlib.luaL_checkinteger(L, 2); let posi = n >= 0 ? 1 : s.length + 1; posi = u_posrelat(lauxlib.luaL_optinteger(L, 3, posi), s.length); + lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= s.length, 3, "position ot ouf range"); if (n === 0) { @@ -64,8 +96,40 @@ const byteoffset = function(L) { return 1; }; +/* +** codepoint(s, [i, [j]]) -> returns codepoints for all characters +** that start in the range [i,j] +*/ +const codepoint = function(L) { + let s = lauxlib.luaL_checkstring(L, 1); + s = L.stack[lapi.index2addr_(L, 1)].value; + let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), s.length); + let pose = u_posrelat(lauxlib.luaL_optinteger(L, 3, posi), s.length); + + lauxlib.luaL_argcheck(L, posi >= 1, 2, "out of range"); + lauxlib.luaL_argcheck(L, pose <= s.length, 3, "out of range"); + + if (posi > pose) return 0; /* empty interval; return no values */ + if (pose - posi >= Number.MAX_SAFE_INTEGER) + return lauxlib.luaL_error(L, "string slice too long"); + let n = (pose - posi) + 1; + lauxlib.luaL_checkstack(L, n, "string slice too long"); + n = 0; + for (s = s.slice(posi - 1); n < pose - posi;) { + let dec = utf8_decode(s); + if (dec === null) + return lauxlib.luaL_error(L, "invalid UTF-8 code"); + s = dec.string; + let code = dec.code; + lapi.lua_pushinteger(L, code); + n++; + } + return n; +}; + const funcs = { - "offset": byteoffset + "codepoint": codepoint, + "offset": byteoffset }; /* pattern to match a single UTF-8 character */ |