From 2269fd42e2d44096d368718c314213ff7aafc164 Mon Sep 17 00:00:00 2001
From: Benoit Giannangeli <benoit.giannangeli@boursorama.fr>
Date: Wed, 15 Mar 2017 07:56:37 +0100
Subject: utf8.codepoint

---
 src/lutf8lib.js | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/lutf8lib.js b/src/lutf8lib.js
index db5f7ad..f2c7f6b 100644
--- a/src/lutf8lib.js
+++ b/src/lutf8lib.js
@@ -6,6 +6,7 @@ const lua     = require('./lua.js');
 const lapi    = require('./lapi.js');
 const lauxlib = require('./lauxlib.js');
 
+const MAXUNICODE = 0x10FFFF;  /* utf8_decode rejects any decoded value above this */
 
 const iscont = function(p) {
     let c = p & 0xC0;
@@ -19,6 +20,36 @@ const u_posrelat = function(pos, len) {
     else return len + pos + 1;
 };
 
+/*
+** Decode the UTF-8 sequence at the start of 's'.
+** Returns null when the sequence is invalid; otherwise an object holding
+** 'code' (the decoded value) and 'string' ('s' minus the bytes consumed).
+** The 'val' parameter is not read.
+*/
+const utf8_decode = function(s, val) {
+    const lead = s[0];
+    if (lead < 0x80)  /* ascii: the byte is the code */
+        return { string: s.slice(1), code: lead };
+
+    /* a sequence with k continuation bytes must decode above minima[k] */
+    const minima = [0xFF, 0x7F, 0x7FF, 0xFFFF];
+    let code = 0;
+    let extra = 0;  /* continuation bytes read so far */
+    let bits = lead;  /* lead byte, shifted left once per continuation byte */
+    for (; bits & 0x40; bits <<= 1) {  /* another continuation byte due? */
+        const next = s[++extra];
+        if ((next & 0xC0) !== 0x80)
+            return null;  /* missing or malformed continuation byte */
+        code = (code << 6) | (next & 0x3F);  /* append its low 6 bits */
+    }
+    code |= (bits & 0x7F) << (extra * 5);  /* merge payload of lead byte */
+    if (extra > 3 || code > MAXUNICODE || code <= minima[extra])
+        return null;  /* too long, out of range, or overlong encoding */
+
+    /* drop the lead byte plus every continuation byte */
+    return { string: s.slice(extra + 1), code: code };
+};
+
 /*
 ** offset(s, n, [i])  -> index where n-th character counting from
 **   position 'i' starts; 0 means character at 'i'.
@@ -29,6 +60,7 @@ const byteoffset = function(L) {
     let n = lauxlib.luaL_checkinteger(L, 2);
     let posi = n >= 0 ? 1 : s.length + 1;
     posi = u_posrelat(lauxlib.luaL_optinteger(L, 3, posi), s.length);
+
     lauxlib.luaL_argcheck(L, 1 <= posi && --posi <= s.length, 3, "position ot ouf range");
 
     if (n === 0) {
@@ -64,8 +96,40 @@ const byteoffset = function(L) {
     return 1;
 };
 
+/*
+** codepoint(s, [i, [j]])  -> pushes the codepoints of all characters that
+** start in the byte range [i,j]; returns how many values were pushed.
+*/
+const codepoint = function(L) {
+    lauxlib.luaL_checkstring(L, 1);  /* result unused; presumably validates arg 1 */
+    let s = L.stack[lapi.index2addr_(L, 1)].value;  /* raw value of arg 1 */
+    let posi = u_posrelat(lauxlib.luaL_optinteger(L, 2, 1), s.length);
+    let pose = u_posrelat(lauxlib.luaL_optinteger(L, 3, posi), s.length);
+
+    lauxlib.luaL_argcheck(L, posi >= 1, 2, "out of range");
+    lauxlib.luaL_argcheck(L, pose <= s.length, 3, "out of range");
+
+    if (posi > pose) return 0;  /* empty interval; return no values */
+    if (pose - posi >= Number.MAX_SAFE_INTEGER)
+        return lauxlib.luaL_error(L, "string slice too long");
+    lauxlib.luaL_checkstack(L, (pose - posi) + 1, "string slice too long");
+    let n = 0;  /* number of codepoints pushed */
+    let stop = s.length - pose;  /* bytes remaining once position 'j' is passed */
+    /* bound the loop by bytes consumed, not by 'n': one character may span */
+    /* several bytes, and i == j must still decode the character at 'i' */
+    for (s = s.slice(posi - 1); s.length > stop;) {
+        let dec = utf8_decode(s);
+        if (dec === null)
+            return lauxlib.luaL_error(L, "invalid UTF-8 code");
+        s = dec.string;  /* continue after the bytes just decoded */
+        lapi.lua_pushinteger(L, dec.code);
+        n++;
+    }
+    return n;
+};
+
 const funcs = {
-    "offset": byteoffset
+    "codepoint": codepoint,
+    "offset":    byteoffset
 };
 
 /* pattern to match a single UTF-8 character */
-- 
cgit v1.2.3-70-g09d2