From 2ec10cc9151920aa286a7ae376888d61cf9a3bca Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sun, 12 Nov 2017 14:55:08 +1100 Subject: src/defs.js: Refactor to_jsstring Now fails on non-utf8 input. Previously it would convert to equivalent unicode codepoints as bytes, which did not round-trip --- src/defs.js | 60 ++++++++++++++++++++++++++++------------------------------- tests/defs.js | 8 ++++++++ 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/src/defs.js b/src/defs.js index 5ebaad9..e55e3dc 100644 --- a/src/defs.js +++ b/src/defs.js @@ -138,43 +138,39 @@ const is_luastring = function(s) { const to_jsstring = function(value, from, to) { assert(is_luastring(value), "jsstring expect a array of bytes"); - let u0, u1, u2, u3, u4, u5; - let idx = 0; value = value.slice(from ? from : 0, to); - var str = ''; - while (1) { - // For UTF8 byte structure, see http://en.wikipedia.org/wiki/UTF-8#Description and https://www.ietf.org/rfc/rfc2279.txt and https://tools.ietf.org/html/rfc3629 - u0 = value[idx++]; - if (u0 === 0) { str += "\0"; continue; } // Lua string embed '\0' - if (!u0) return str; - if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; } - u1 = value[idx++] & 63; - if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; } - u2 = value[idx++] & 63; - if ((u0 & 0xF0) == 0xE0) { - u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; - } else { - u3 = value[idx++] & 63; - if ((u0 & 0xF8) == 0xF0) { - u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | u3; - } else { - u4 = value[idx++] & 63; - if ((u0 & 0xFC) == 0xF8) { - u0 = ((u0 & 3) << 24) | (u1 << 18) | (u2 << 12) | (u3 << 6) | u4; - } else { - u5 = value[idx++] & 63; - u0 = ((u0 & 1) << 30) | (u1 << 24) | (u2 << 18) | (u3 << 12) | (u4 << 6) | u5; - } - } - } - if (u0 < 0x10000) { - str += String.fromCharCode(u0); + let str = ""; + for (let i = 0; i < value.length;) { + let u; + let u0 = value[i++]; + if (u0 < 0x80) { + /* single byte sequence */ + u = u0; + } else if (u0 < 0xC2 || u0 > 0xF4) { + throw RangeError("cannot convert invalid utf8 to javascript string"); + } else if (u0 <= 0xDF) { + /* two byte sequence */ + if (i >= value.length) throw RangeError("cannot convert invalid utf8 to javascript string"); + let u1 = value[i++]; + u = ((u0 & 0x1F) << 6) + (u1 & 0x3F); + } else if (u0 <= 0xEF) { + /* three byte sequence */ + if (i+1 >= value.length) throw RangeError("cannot convert invalid utf8 to javascript string"); + let u1 = value[i++]; + let u2 = value[i++]; + u = ((u0 & 0x0F) << 12) + ((u1 & 0x3F) << 6) + (u2 & 0x3F); } else { - var ch = u0 - 0x10000; - str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF)); + /* four byte sequence */ + if (i+2 >= value.length) throw RangeError("cannot convert invalid utf8 to javascript string"); + let u1 = value[i++]; + let u2 = value[i++]; + let u3 = value[i++]; + u = ((u0 & 0x07) << 18) + ((u1 & 0x3F) << 12) + ((u2 & 0x3F) << 6) + (u3 & 0x3F); } + str += String.fromCodePoint(u); } + return str; }; const to_luastring_cache = {}; diff --git a/tests/defs.js b/tests/defs.js index 07e863b..b00fda0 100644 --- a/tests/defs.js +++ b/tests/defs.js @@ -56,3 +56,11 @@ test('to_jsstring', function (t) { t.deepEqual(defs.to_jsstring(v.byte_array), v.literal, v.description); }); }); + +test('to_jsstring fails on invalid unicode', function (t) { + t.plan(1); + + t.throws(function() { + defs.to_jsstring([165]); + }, "non-utf8 char"); +}); -- cgit v1.2.3-70-g09d2