Skip to content

Implement RegExp serialization #153

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,14 @@ static inline uint64_t bswap64(uint64_t v)
((v & ((uint64_t)0xff << (0 * 8))) << (7 * 8));
}

/* Reverse, in place, the byte order of the 16-bit value stored at tab. */
static inline void inplace_bswap16(uint8_t *tab)
{
    uint16_t value = get_u16(tab);

    put_u16(tab, bswap16(value));
}

/* Reverse, in place, the byte order of the 32-bit value stored at tab. */
static inline void inplace_bswap32(uint8_t *tab)
{
    uint32_t value = get_u32(tab);

    put_u32(tab, bswap32(value));
}

/* XXX: should take an extra argument to pass slack information to the caller */
typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size);

Expand Down
65 changes: 65 additions & 0 deletions libregexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2557,6 +2557,71 @@ const char *lre_get_groupnames(const uint8_t *bc_buf)
return (const char *)(bc_buf + 7 + re_bytecode_len);
}

/*
 * Byte-swap, in place, the multi-byte fields of a compiled regexp so the
 * blob can be converted between byte orders.
 *
 * buf/len cover the whole blob, laid out as:
 *   <header>
 *   <bytecode>
 *   <capture group name 1>
 *   <capture group name 2>
 *   etc.
 * Only the bytecode-size field of the header and the operands of the
 * bytecode instructions are modified; the group names are not touched.
 *
 * is_byte_swapped is TRUE when the blob is currently NOT in host byte
 * order. Every length that is read from the blob in order to walk it
 * (the bytecode size, the range counts) is then swapped before use.
 *
 * Malformed input (short header, a size or instruction running past the
 * end of the bytecode, an unexpected opcode size) aborts the process.
 */
void lre_byte_swap(uint8_t *buf, size_t len, BOOL is_byte_swapped)
{
    uint8_t *p, *pe;
    uint32_t n, r, nb_pairs;

    p = buf;
    if (len < RE_HEADER_LEN)
        abort();

    n = get_u32(&p[3]); // bytecode size
    inplace_bswap32(&p[3]);
    if (is_byte_swapped)
        n = bswap32(n);
    if (n > len - RE_HEADER_LEN)
        abort();

    p = &buf[RE_HEADER_LEN];
    pe = &p[n];

    while (p < pe) {
        n = reopcode_info[*p].size;
        // the fixed part of the instruction must fit in the bytecode
        if (n > (size_t)(pe - p))
            abort();
        switch (n) {
        case 1:
        case 2:
            break;
        case 3:
            switch (*p) {
            case REOP_save_reset: // has two 8 bit arguments
                break;
            case REOP_range32: // variable length
                // The operand is the number of (lo, hi) pairs as emitted
                // by re_emit_range(), i.e. 8 payload bytes per entry. It
                // is read before p[1] itself gets swapped below.
                nb_pairs = get_u16(&p[1]);
                if (is_byte_swapped)
                    nb_pairs = bswap16(nb_pairs);
                r = 3 + 8 * nb_pairs;
                if (r > (size_t)(pe - p))
                    abort();
                for (; n < r; n += 4)
                    inplace_bswap32(&p[n]);
                goto doswap16;
            case REOP_range: // variable length
                // Same layout with 16 bit bounds: 4 payload bytes per pair.
                nb_pairs = get_u16(&p[1]);
                if (is_byte_swapped)
                    nb_pairs = bswap16(nb_pairs);
                r = 3 + 4 * nb_pairs;
                if (r > (size_t)(pe - p))
                    abort();
                for (; n < r; n += 2)
                    inplace_bswap16(&p[n]);
                goto doswap16;
            default:
            doswap16:
                inplace_bswap16(&p[1]);
                break;
            }
            break;
        case 5:
            inplace_bswap32(&p[1]);
            break;
        case 17:
            assert(*p == REOP_simple_greedy_quant);
            inplace_bswap32(&p[1]);
            inplace_bswap32(&p[5]);
            inplace_bswap32(&p[9]);
            inplace_bswap32(&p[13]);
            break;
        default:
            abort();
        }
        // n now holds the full instruction length, payload included
        p = &p[n];
    }
}

#ifdef TEST

BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
Expand Down
2 changes: 2 additions & 0 deletions libregexp.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ int lre_exec(uint8_t **capture,
int lre_parse_escape(const uint8_t **pp, int allow_utf16);
LRE_BOOL lre_is_space(int c);

/* Byte-swap in place the multi-byte fields of the compiled regexp held in
   'buf' ('len' bytes: header, bytecode, then capture group names).
   'is_byte_swapped' tells whether the bytecode size stored in the header
   is currently not in host byte order and must be swapped before being
   used. Aborts if 'len' is smaller than the header or if the stored
   bytecode size does not fit in 'len'.
   NOTE(review): neighbouring declarations use LRE_BOOL; confirm BOOL is
   in scope for every includer of this header. */
void lre_byte_swap(uint8_t *buf, size_t len, BOOL is_byte_swapped);

/* must be provided by the user */
LRE_BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size);
void *lre_realloc(void *opaque, void *ptr, size_t size);
Expand Down
51 changes: 51 additions & 0 deletions quickjs.c
Original file line number Diff line number Diff line change
Expand Up @@ -31659,6 +31659,7 @@ typedef enum BCTagEnum {
BC_TAG_TYPED_ARRAY,
BC_TAG_ARRAY_BUFFER,
BC_TAG_SHARED_ARRAY_BUFFER,
BC_TAG_REGEXP,
BC_TAG_DATE,
BC_TAG_OBJECT_VALUE,
BC_TAG_OBJECT_REFERENCE,
Expand Down Expand Up @@ -32272,6 +32273,24 @@ static int JS_WriteSharedArrayBuffer(BCWriterState *s, JSValueConst obj)
return 0;
}

/*
 * Serialize a RegExp as two strings: the source pattern followed by the
 * compiled bytecode. Always returns 0.
 *
 * On a big-endian host the bytecode is converted around the write
 * (do and undo): swap from BE to LE, write the data out, swap back to BE
 * so the live object is left unchanged.
 *
 * The is_byte_swapped argument tells lre_byte_swap() whether the size
 * field in the header is in host endianness or not, i.e. whether it has
 * to byte-swap it before using it. It reads that field to know how large
 * the bytecode is, because bc->len == header + bytecode + group names.
 */
static int JS_WriteRegExp(BCWriterState *s, JSRegExp regexp)
{
    JSString *bytecode = regexp.bytecode;
    const BOOL need_swap = is_be();

    assert(!bytecode->is_wide_char);

    JS_WriteString(s, regexp.pattern);

    if (need_swap) {
        /* host order -> serialized order */
        lre_byte_swap(bytecode->u.str8, bytecode->len,
                      /*is_byte_swapped*/FALSE);
    }

    JS_WriteString(s, bytecode);

    if (need_swap) {
        /* serialized order -> host order */
        lre_byte_swap(bytecode->u.str8, bytecode->len,
                      /*is_byte_swapped*/TRUE);
    }

    return 0;
}

static int JS_WriteObjectRec(BCWriterState *s, JSValueConst obj)
{
uint32_t tag;
Expand Down Expand Up @@ -32360,6 +32379,10 @@ static int JS_WriteObjectRec(BCWriterState *s, JSValueConst obj)
goto invalid_tag;
ret = JS_WriteSharedArrayBuffer(s, obj);
break;
case JS_CLASS_REGEXP:
bc_put_u8(s, BC_TAG_REGEXP);
ret = JS_WriteRegExp(s, p->u.regexp);
break;
case JS_CLASS_DATE:
bc_put_u8(s, BC_TAG_DATE);
ret = JS_WriteObjectRec(s, p->u.object_data);
Expand Down Expand Up @@ -33357,6 +33380,31 @@ static JSValue JS_ReadSharedArrayBuffer(BCReaderState *s)
return JS_EXCEPTION;
}

/*
 * Deserialize a RegExp: read the source pattern, then the compiled
 * bytecode, and build the object with js_regexp_constructor_internal().
 *
 * Returns the new RegExp value, or JS_EXCEPTION if either string cannot
 * be read or the bytecode string is not an 8-bit string.
 *
 * NOTE(review): both strings are handed to
 * js_regexp_constructor_internal() without being freed here, so it
 * presumably takes ownership of them; confirm against its definition.
 */
static JSValue JS_ReadRegExp(BCReaderState *s)
{
    JSContext *ctx = s->ctx;
    JSString *pattern;
    JSString *bc;

    pattern = JS_ReadString(s);
    if (!pattern)
        return JS_EXCEPTION;

    bc = JS_ReadString(s);
    if (!bc) {
        js_free_string(ctx->rt, pattern);
        return JS_EXCEPTION;
    }

    /* The input is untrusted serialized data: reject a wide-char string
       with an exception rather than asserting, since the bytecode is
       accessed below as raw bytes through u.str8. */
    if (bc->is_wide_char) {
        js_free_string(ctx->rt, pattern);
        js_free_string(ctx->rt, bc);
        return JS_ThrowInternalError(ctx, "bad regexp bytecode");
    }

    /* On a big-endian host the freshly read bytecode is not in host
       order yet, hence is_byte_swapped == TRUE. */
    if (is_be())
        lre_byte_swap(bc->u.str8, bc->len, /*is_byte_swapped*/TRUE);

    return js_regexp_constructor_internal(ctx, JS_UNDEFINED,
                                          JS_MKPTR(JS_TAG_STRING, pattern),
                                          JS_MKPTR(JS_TAG_STRING, bc));
}

static JSValue JS_ReadDate(BCReaderState *s)
{
JSContext *ctx = s->ctx;
Expand Down Expand Up @@ -33484,6 +33532,9 @@ static JSValue JS_ReadObjectRec(BCReaderState *s)
goto invalid_tag;
obj = JS_ReadSharedArrayBuffer(s);
break;
case BC_TAG_REGEXP:
obj = JS_ReadRegExp(s);
break;
case BC_TAG_DATE:
obj = JS_ReadDate(s);
break;
Expand Down
13 changes: 13 additions & 0 deletions tests/test_bjson.js
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,18 @@ function bjson_test_reference()
}
}

// Round-trip RegExp objects through bjson: plain and flagged literals via
// bjson_test(), then a named capture group with a non-BMP name, checking
// that the deserialized regexp still exposes the group under that name.
function bjson_test_regexp()
{
    var serialized, revived, match;

    bjson_test(/xyzzy/);
    bjson_test(/xyzzy/digu);

    serialized = bjson.write(/(?<𝓓𝓸𝓰>dog)/);
    revived = bjson.read(serialized, 0, serialized.byteLength);
    match = "sup dog".match(revived);
    assert(match.groups["𝓓𝓸𝓰"], "dog");
}

function bjson_test_all()
{
var obj;
Expand Down Expand Up @@ -171,6 +183,7 @@ function bjson_test_all()
}

bjson_test_reference();
bjson_test_regexp();
}

bjson_test_all();