Skip to content

Commit 3564848

Browse files
authored
Add string conversion error modes (#1902)
1 parent 3a76daf commit 3564848

23 files changed

+5214
-4087
lines changed

lib/loader/index.js

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,24 @@ const ARRAY_SIZE = 16;
3737
const BIGINT = typeof BigUint64Array !== "undefined";
3838
const THIS = Symbol();
3939

40-
const STRING_DECODE_THRESHOLD = 32;
41-
const decoder = new TextDecoder("utf-16le");
40+
const STRING_SMALLSIZE = 192; // break-even point in V8
41+
const STRING_CHUNKSIZE = 1024; // mitigate stack overflow
42+
const utf16 = new TextDecoder("utf-16le", { fatal: true }); // != wtf16
4243

43-
/** Gets a string from an U32 and an U16 view on a memory. */
44+
/** Gets a string from memory. */
4445
function getStringImpl(buffer, ptr) {
45-
const len = new Uint32Array(buffer)[ptr + SIZE_OFFSET >>> 2] >>> 1;
46-
const arr = new Uint16Array(buffer, ptr, len);
47-
if (len <= STRING_DECODE_THRESHOLD) {
48-
return String.fromCharCode.apply(String, arr);
46+
let len = new Uint32Array(buffer)[ptr + SIZE_OFFSET >>> 2] >>> 1;
47+
const wtf16 = new Uint16Array(buffer, ptr, len);
48+
if (len <= STRING_SMALLSIZE) return String.fromCharCode(...wtf16);
49+
try {
50+
return utf16.decode(wtf16);
51+
} catch {
52+
let str = "", off = 0;
53+
while (len - off > STRING_CHUNKSIZE) {
54+
str += String.fromCharCode(...wtf16.subarray(off, off += STRING_CHUNKSIZE));
55+
}
56+
return str + String.fromCharCode(...wtf16.subarray(off));
4957
}
50-
return decoder.decode(arr);
5158
}
5259

5360
/** Prepares the base module prior to instantiation. */

lib/loader/package.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,8 @@
4343
"umd/index.js",
4444
"umd/package.json",
4545
"README.md"
46-
]
47-
}
46+
],
47+
"devDependencies": {
48+
"esm2umd": "^0.1.2"
49+
}
50+
}

lib/loader/tests/build/default.wasm

1.23 KB
Binary file not shown.

lib/loader/tests/build/legacy.wasm

1.23 KB
Binary file not shown.

lib/loader/umd/index.js

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,19 +44,33 @@ var loader = (function(exports) {
4444
const ARRAY_SIZE = 16;
4545
const BIGINT = typeof BigUint64Array !== "undefined";
4646
const THIS = Symbol();
47-
const STRING_DECODE_THRESHOLD = 32;
48-
const decoder = new TextDecoder("utf-16le");
49-
/** Gets a string from an U32 and an U16 view on a memory. */
47+
const STRING_SMALLSIZE = 192; // break-even point in V8
48+
49+
const STRING_CHUNKSIZE = 1024; // mitigate stack overflow
50+
51+
const utf16 = new TextDecoder("utf-16le", {
52+
fatal: true
53+
}); // != wtf16
54+
55+
/** Gets a string from memory. */
5056

5157
function getStringImpl(buffer, ptr) {
52-
const len = new Uint32Array(buffer)[ptr + SIZE_OFFSET >>> 2] >>> 1;
53-
const arr = new Uint16Array(buffer, ptr, len);
58+
let len = new Uint32Array(buffer)[ptr + SIZE_OFFSET >>> 2] >>> 1;
59+
const wtf16 = new Uint16Array(buffer, ptr, len);
60+
if (len <= STRING_SMALLSIZE) return String.fromCharCode(...wtf16);
61+
62+
try {
63+
return utf16.decode(wtf16);
64+
} catch {
65+
let str = "",
66+
off = 0;
67+
68+
while (len - off > STRING_CHUNKSIZE) {
69+
str += String.fromCharCode(...wtf16.subarray(off, off += STRING_CHUNKSIZE));
70+
}
5471

55-
if (len <= STRING_DECODE_THRESHOLD) {
56-
return String.fromCharCode.apply(String, arr);
72+
return str + String.fromCharCode(...wtf16.subarray(off));
5773
}
58-
59-
return decoder.decode(arr);
6074
}
6175
/** Prepares the base module prior to instantiation. */
6276

@@ -110,9 +124,10 @@ var loader = (function(exports) {
110124

111125
const __collect = exports.__collect || F_NOEXPORTRUNTIME;
112126

113-
const __rtti_base = exports.__rtti_base || ~0; // oob if not present
114-
115-
127+
const __rtti_base = exports.__rtti_base;
128+
const getRttiCount = __rtti_base ? function (arr) {
129+
return arr[__rtti_base >>> 2];
130+
} : F_NOEXPORTRUNTIME;
116131
extendedExports.__new = __new;
117132
extendedExports.__pin = __pin;
118133
extendedExports.__unpin = __unpin;
@@ -121,7 +136,7 @@ var loader = (function(exports) {
121136

122137
function getInfo(id) {
123138
const U32 = new Uint32Array(memory.buffer);
124-
const count = U32[__rtti_base >>> 2];
139+
const count = getRttiCount(U32);
125140
if ((id >>>= 0) >= count) throw Error(`invalid id: ${id}`);
126141
return U32[(__rtti_base + 4 >>> 2) + id * 2];
127142
}
@@ -138,7 +153,7 @@ var loader = (function(exports) {
138153

139154
function getBase(id) {
140155
const U32 = new Uint32Array(memory.buffer);
141-
const count = U32[__rtti_base >>> 2];
156+
const count = getRttiCount(U32);
142157
if ((id >>>= 0) >= count) throw Error(`invalid id: ${id}`);
143158
return U32[(__rtti_base + 4 >>> 2) + id * 2 + 1];
144159
}
@@ -330,7 +345,7 @@ var loader = (function(exports) {
330345
const U32 = new Uint32Array(memory.buffer);
331346
let id = U32[ptr + ID_OFFSET >>> 2];
332347

333-
if (id <= U32[__rtti_base >>> 2]) {
348+
if (id <= getRttiCount(U32)) {
334349
do {
335350
if (id == baseId) return true;
336351
id = getBase(id);

std/assembly/index.d.ts

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1773,12 +1773,21 @@ declare class String {
17731773
declare namespace String {
17741774
/** Encoding helpers for UTF-8. */
17751775
export namespace UTF8 {
1776+
/** UTF-8 encoding error modes. */
1777+
export const enum ErrorMode {
1778+
/** Keeps unpaired surrogates as in WTF-8. This is the default. */
1779+
WTF8,
1780+
/** Replaces unpaired surrogates with the replacement character (U+FFFD). */
1781+
REPLACE,
1782+
/** Throws an error on unpaired surrogates. */
1783+
ERROR
1784+
}
17761785
/** Calculates the byte length of the specified string when encoded as UTF-8, optionally null terminated. */
17771786
export function byteLength(str: string, nullTerminated?: bool): i32;
1778-
/** Encodes the specified string to UTF-8 bytes, optionally null terminated. */
1779-
export function encode(str: string, nullTerminated?: bool): ArrayBuffer;
1780-
/** Encodes the specified raw string to UTF-8 bytes, optionally null terminated. Returns the number of bytes written. */
1781-
export function encodeUnsafe(str: usize, len: i32, buf: usize, nullTerminated?: bool): usize;
1787+
/** Encodes the specified string to UTF-8 bytes, optionally null terminated. ErrorMode defaults to WTF-8. */
1788+
export function encode(str: string, nullTerminated?: bool, errorMode?: ErrorMode): ArrayBuffer;
1789+
/** Encodes the specified raw string to UTF-8 bytes, optionally null terminated. ErrorMode defaults to WTF-8. Returns the number of bytes written. */
1790+
export function encodeUnsafe(str: usize, len: i32, buf: usize, nullTerminated?: bool, errorMode?: ErrorMode): usize;
17821791
/** Decodes the specified buffer from UTF-8 bytes to a string, optionally null terminated. */
17831792
export function decode(buf: ArrayBuffer, nullTerminated?: bool): string;
17841793
/** Decodes raw UTF-8 bytes to a string, optionally null terminated. */

std/assembly/string.ts

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import { OBJECT, BLOCK_MAXSIZE, TOTAL_OVERHEAD } from "./rt/common";
44
import { compareImpl, strtol, strtod, isSpace, isAscii, isFinalSigma, toLower8, toUpper8 } from "./util/string";
55
import { SPECIALS_UPPER, casemap, bsearch } from "./util/casemap";
6-
import { E_INDEXOUTOFRANGE, E_INVALIDLENGTH } from "./util/error";
6+
import { E_INDEXOUTOFRANGE, E_INVALIDLENGTH, E_UNPAIRED_SURROGATE } from "./util/error";
77
import { idof } from "./builtins";
88
import { Array } from "./array";
99

@@ -661,6 +661,12 @@ export namespace String {
661661

662662
export namespace UTF8 {
663663

664+
export const enum ErrorMode {
665+
WTF8,
666+
REPLACE,
667+
ERROR
668+
}
669+
664670
export function byteLength(str: string, nullTerminated: bool = false): i32 {
665671
var strOff = changetype<usize>(str);
666672
var strEnd = strOff + <usize>changetype<OBJECT>(changetype<usize>(str) - TOTAL_OVERHEAD).rtSize;
@@ -687,15 +693,15 @@ export namespace String {
687693
return bufLen;
688694
}
689695

690-
export function encode(str: string, nullTerminated: bool = false): ArrayBuffer {
696+
export function encode(str: string, nullTerminated: bool = false, errorMode: ErrorMode = ErrorMode.WTF8): ArrayBuffer {
691697
var buf = changetype<ArrayBuffer>(__new(<usize>byteLength(str, nullTerminated), idof<ArrayBuffer>()));
692-
encodeUnsafe(changetype<usize>(str), str.length, changetype<usize>(buf), nullTerminated);
698+
encodeUnsafe(changetype<usize>(str), str.length, changetype<usize>(buf), nullTerminated, errorMode);
693699
return buf;
694700
}
695701

696702
// @ts-ignore: decorator
697703
@unsafe
698-
export function encodeUnsafe(str: usize, len: i32, buf: usize, nullTerminated: bool = false): usize {
704+
export function encodeUnsafe(str: usize, len: i32, buf: usize, nullTerminated: bool = false, errorMode: ErrorMode = ErrorMode.WTF8): usize {
699705
var strEnd = str + (<usize>len << 1);
700706
var bufOff = buf;
701707
while (str < strEnd) {
@@ -709,17 +715,29 @@ export namespace String {
709715
store<u16>(bufOff, b1 << 8 | b0);
710716
bufOff += 2;
711717
} else {
712-
if ((c1 & 0xFC00) == 0xD800 && str + 2 < strEnd) {
713-
let c2 = <u32>load<u16>(str, 2);
714-
if ((c2 & 0xFC00) == 0xDC00) {
715-
c1 = 0x10000 + ((c1 & 0x03FF) << 10) | (c2 & 0x03FF);
716-
let b0 = c1 >> 18 | 240;
717-
let b1 = c1 >> 12 & 63 | 128;
718-
let b2 = c1 >> 6 & 63 | 128;
719-
let b3 = c1 & 63 | 128;
720-
store<u32>(bufOff, b3 << 24 | b2 << 16 | b1 << 8 | b0);
721-
bufOff += 4; str += 4;
722-
continue;
718+
// D800: 11011 0 0000000000 Lead
719+
// DBFF: 11011 0 1111111111
720+
// DC00: 11011 1 0000000000 Trail
721+
// DFFF: 11011 1 1111111111
722+
// F800: 11111 0 0000000000 Mask
723+
// FC00: 11111 1 0000000000
724+
if ((c1 & 0xF800) == 0xD800) {
725+
if (c1 < 0xDC00 && str + 2 < strEnd) {
726+
let c2 = <u32>load<u16>(str, 2);
727+
if ((c2 & 0xFC00) == 0xDC00) {
728+
c1 = 0x10000 + ((c1 & 0x03FF) << 10) | (c2 & 0x03FF);
729+
let b0 = c1 >> 18 | 240;
730+
let b1 = c1 >> 12 & 63 | 128;
731+
let b2 = c1 >> 6 & 63 | 128;
732+
let b3 = c1 & 63 | 128;
733+
store<u32>(bufOff, b3 << 24 | b2 << 16 | b1 << 8 | b0);
734+
bufOff += 4; str += 4;
735+
continue;
736+
}
737+
}
738+
if (errorMode != ErrorMode.WTF8) { // unlikely
739+
if (errorMode == ErrorMode.ERROR) throw new Error(E_UNPAIRED_SURROGATE);
740+
c1 = 0xFFFD;
723741
}
724742
}
725743
let b0 = c1 >> 12 | 224;

std/assembly/util/error.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,7 @@ export const E_URI_MALFORMED: string = "URI malformed";
5252
// @ts-ignore: decorator
5353
@lazy @inline
5454
export const E_INVALIDDATE: string = "Invalid Date";
55+
56+
// @ts-ignore: decorator
57+
@lazy @inline
58+
export const E_UNPAIRED_SURROGATE: string = "Unpaired surrogate";

0 commit comments

Comments
 (0)