From ee3a15a12f56f4367abf2e469da8b5333cef5358 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Tue, 6 May 2025 13:23:04 -0500 Subject: [PATCH 01/24] Implement i8 jiterp shuffle and simplify logic --- .../runtime/jiterpreter-trace-generator.ts | 90 +++++++------------ 1 file changed, 30 insertions(+), 60 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 8da5b65fad7044..6a0f02f7e0b199 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3837,34 +3837,10 @@ function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store); return true; } - case SimdIntrinsic3.V128_I1_SHUFFLE: { - // Detect a constant indices vector and turn it into a const. This allows - // v8 to use a more optimized implementation of the swizzle opcode - const indicesOffset = getArgU16(ip, 3), - constantIndices = get_known_constant_value(builder, indicesOffset); - - // Pre-load destination ptr - builder.local("pLocals"); - // Load vec - append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); - - if (typeof (constantIndices) === "object") { - // HACK: Use the known constant vector directly instead of loading it from memory. - builder.appendSimd(WasmSimdOpcode.v128_const); - builder.appendBytes(constantIndices); - } else { - // Load the indices from memory - append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); - } - - // we now have two vectors on the stack, the values and the byte indices - builder.appendSimd(WasmSimdOpcode.i8x16_swizzle); - append_simd_store(builder, ip); - return true; - } + case SimdIntrinsic3.V128_I1_SHUFFLE: case SimdIntrinsic3.V128_I2_SHUFFLE: case SimdIntrinsic3.V128_I4_SHUFFLE: - // FIXME: I8 + case SimdIntrinsic3.V128_I8_SHUFFLE: return emit_shuffle(builder, ip, index === SimdIntrinsic3.V128_I2_SHUFFLE ? 8 : 4); default: return false; @@ -3879,19 +3855,21 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu const elementSize = 16 / elementCount, indicesOffset = getArgU16(ip, 3), constantIndices = get_known_constant_value(builder, indicesOffset); - mono_assert((elementSize === 2) || (elementSize === 4), "Unsupported shuffle element size"); // Pre-load destination ptr builder.local("pLocals"); // Load vec append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); - if (typeof (constantIndices) === "object") { + if (typeof (constantIndices) === "object" && (elementSize <= 4)) { // HACK: We have a known constant shuffle vector with char or int indices. Expand it to // byte indices and then embed a new constant in the trace. const newShuffleVector = new Uint8Array(sizeOfV128), - nativeIndices = (elementSize === 2) - ? new Uint16Array(constantIndices.buffer, constantIndices.byteOffset, elementCount) - : new Uint32Array(constantIndices.buffer, constantIndices.byteOffset, elementCount); + nativeIndices = (elementSize === 1) ? + new Uint8Array(constantIndices.buffer, constantIndices.byteOffset, elementCount) + : (elementSize === 2) ? + new Uint16Array(constantIndices.buffer, constantIndices.byteOffset, elementCount) + : new Uint32Array(constantIndices.buffer, constantIndices.byteOffset, elementCount); + for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) { const elementIndex = nativeIndices[i]; for (let j = 0; j < elementSize; j++) @@ -3903,36 +3881,28 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu } else { // Load indices (in chars) append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); - // There's no direct narrowing opcode for i32 -> i8, so we have to do two steps :( - if (elementCount === 4) { - // i32{lane0 ... lane3} -> i16{lane0 ... lane3, 0 ...} - builder.v128_const(0); - builder.appendSimd(WasmSimdOpcode.i16x8_narrow_i32x4_u); - } - // Load a zero vector (narrow takes two vectors) - builder.v128_const(0); - // i16{lane0 ... lane7} -> i8{lane0 ... lane7, 0 ...} - builder.appendSimd(WasmSimdOpcode.i8x16_narrow_i16x8_u); - // i8{0, 1, 2, 3 ...} -> i8{0, 0, 1, 1, 2, 2, 3, 3 ...} - builder.appendSimd(WasmSimdOpcode.v128_const); - for (let i = 0; i < elementCount; i++) { - for (let j = 0; j < elementSize; j++) - builder.appendU8(i); - } - builder.appendSimd(WasmSimdOpcode.i8x16_swizzle); - // multiply indices by 2 or 4 to scale from elt indices to byte indices - builder.i32_const(elementCount === 4 ? 2 : 1); - builder.appendSimd(WasmSimdOpcode.i8x16_shl); - // now add an offset to the additional bytes of each lane, i.e. - // 0 1 2 3 0 1 2 3 ... - builder.appendSimd(WasmSimdOpcode.v128_const); - for (let i = 0; i < elementCount; i++) { - for (let j = 0; j < elementSize; j++) - builder.appendU8(j); + if (elementCount < 16) { + const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; + const stride = 16 / elementCount; + + builder.i32_const(shift); + builder.appendSimd(WasmSimdOpcode.i8x16_shl); + + builder.appendSimd(WasmSimdOpcode.v128_const); + for (let i = 0; i < elementCount; i++) { + for (let j = 0; j < elementSize; j++) + builder.appendU8(i * stride); + } + builder.appendSimd(WasmSimdOpcode.i8x16_swizzle); + + builder.appendSimd(WasmSimdOpcode.v128_const); + for (let i = 0; i < elementCount; i++) { + for (let j = 0; j < elementSize; j++) + builder.appendU8(j); + } + + builder.appendSimd(WasmSimdOpcode.v128_or); } - // we can do a bitwise or since we know we previously multiplied all the lanes by 2 or 4, - // so the 1 and 2 bits are already clear - builder.appendSimd(WasmSimdOpcode.v128_or); } // we now have two vectors on the stack, the values and the byte indices builder.appendSimd(WasmSimdOpcode.i8x16_swizzle); From 539d8496e645d094d6791d74aeab8c712e32b4a2 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Tue, 6 May 2025 13:31:36 -0500 Subject: [PATCH 02/24] Fix the element count calc --- .../browser/runtime/jiterpreter-trace-generator.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 6a0f02f7e0b199..28e14df11849af 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3841,7 +3841,15 @@ function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin case SimdIntrinsic3.V128_I2_SHUFFLE: case SimdIntrinsic3.V128_I4_SHUFFLE: case SimdIntrinsic3.V128_I8_SHUFFLE: - return emit_shuffle(builder, ip, index === SimdIntrinsic3.V128_I2_SHUFFLE ? 8 : 4); + let elementCount = 16; + if (index === SimdIntrinsic3.V128_I2_SHUFFLE) + elementCount = 8; + else if (index === SimdIntrinsic3.V128_I4_SHUFFLE) + elementCount = 4; + else if (index === SimdIntrinsic3.V128_I8_SHUFFLE) + elementCount = 2; + + return emit_shuffle(builder, ip, elementCount); default: return false; } From 1f1f2936abc20677e03d1251970ca772e6e5b493 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Tue, 6 May 2025 13:33:36 -0500 Subject: [PATCH 03/24] remove extra var --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 28e14df11849af..dd8c5e41ce5c1b 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3891,7 +3891,6 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); if (elementCount < 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; - const stride = 16 / elementCount; builder.i32_const(shift); builder.appendSimd(WasmSimdOpcode.i8x16_shl); @@ -3899,7 +3898,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { for (let j = 0; j < elementSize; j++) - builder.appendU8(i * stride); + builder.appendU8(i * elementSize); } builder.appendSimd(WasmSimdOpcode.i8x16_swizzle); From f96d3564b384a68fd52323b724f7a5a17c3e5339 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Tue, 6 May 2025 16:31:32 -0500 Subject: [PATCH 04/24] More changes --- .../runtime/jiterpreter-trace-generator.ts | 54 ++++++++++++++----- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index dd8c5e41ce5c1b..616dbca5391419 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3840,7 +3840,7 @@ function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin case SimdIntrinsic3.V128_I1_SHUFFLE: case SimdIntrinsic3.V128_I2_SHUFFLE: case SimdIntrinsic3.V128_I4_SHUFFLE: - case SimdIntrinsic3.V128_I8_SHUFFLE: + case SimdIntrinsic3.V128_I8_SHUFFLE: { let elementCount = 16; if (index === SimdIntrinsic3.V128_I2_SHUFFLE) elementCount = 8; @@ -3849,7 +3849,8 @@ function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin else if (index === SimdIntrinsic3.V128_I8_SHUFFLE) elementCount = 2; - return emit_shuffle(builder, ip, elementCount); + return emit_shuffle(builder, ip, elementCount, true); + } default: return false; } @@ -3857,9 +3858,9 @@ function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin return false; } -// implement i16 and i32 shuffles on top of wasm's only shuffle opcode by expanding the -// element shuffle indices into byte indices -function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number): boolean { +// implement shuffles on top of wasm's swizzle opcode by expanding the +// element shuffle indices into byte indices +function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number, check_bounds: boolean): boolean { const elementSize = 16 / elementCount, indicesOffset = getArgU16(ip, 3), constantIndices = get_known_constant_value(builder, indicesOffset); @@ -3868,20 +3869,26 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.local("pLocals"); // Load vec append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); - if (typeof (constantIndices) === "object" && (elementSize <= 4)) { - // HACK: We have a known constant shuffle vector with char or int indices. Expand it to + if (typeof (constantIndices) === "object") { + // HACK: We have a known constant shuffle vector indices. Expand it to // byte indices and then embed a new constant in the trace. const newShuffleVector = new Uint8Array(sizeOfV128), - nativeIndices = (elementSize === 1) ? - new Uint8Array(constantIndices.buffer, constantIndices.byteOffset, elementCount) - : (elementSize === 2) ? - new Uint16Array(constantIndices.buffer, constantIndices.byteOffset, elementCount) - : new Uint32Array(constantIndices.buffer, constantIndices.byteOffset, elementCount); + nativeIndices = new Uint8Array(constantIndices.buffer, constantIndices.byteOffset, elementCount); for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) { - const elementIndex = nativeIndices[i]; - for (let j = 0; j < elementSize; j++) + let elementIndex = 0; + for (let j = 0; j < elementSize; j++) { + if (j == 0) + elementIndex = nativeIndices[k + j]; + else if (nativeIndices[k + j] > 0) { + // this an invalid index, set it to zero + elementIndex = 0; + break; + } + } + for (let j = 0; j < elementSize; j++) { newShuffleVector[k + j] = (elementIndex * elementSize) + j; + } } // console.log(`shuffle w/element size ${elementSize} with constant indices ${nativeIndices} (${constantIndices}) -> byte indices ${newShuffleVector}`); builder.appendSimd(WasmSimdOpcode.v128_const); @@ -3892,6 +3899,25 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu if (elementCount < 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; + if (check_bounds) { + builder.local("shuffle_indices", WasmOpcode.tee_local); + + builder.appendSimd(WasmSimdOpcode.v128_const); + for (let i = 0; i < elementCount; i++) { + builder.appendU8(elementSize - 1); + } + if (elementSize === 2) + builder.appendSimd(WasmSimdOpcode.i16x8_ge_u); + else if (elementSize === 4) + builder.appendSimd(WasmSimdOpcode.i32x4_ge_u); + else if (elementSize === 8) { + builder.appendSimd(WasmSimdOpcode.i64x2_ge_s); + } + + builder.local("shuffle_indices"); + builder.appendSimd(WasmSimdOpcode.v128_or); + } + builder.i32_const(shift); builder.appendSimd(WasmSimdOpcode.i8x16_shl); From 1a768127ced86e2ca20dfa077b777bdeefa1057f Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Tue, 6 May 2025 17:48:44 -0500 Subject: [PATCH 05/24] Rework slightly --- .../runtime/jiterpreter-trace-generator.ts | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 616dbca5391419..34b241854c2ad3 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3860,7 +3860,7 @@ function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin // implement shuffles on top of wasm's swizzle opcode by expanding the // element shuffle indices into byte indices -function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number, check_bounds: boolean): boolean { +function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number, checkBounds: boolean): boolean { const elementSize = 16 / elementCount, indicesOffset = getArgU16(ip, 3), constantIndices = get_known_constant_value(builder, indicesOffset); @@ -3872,34 +3872,36 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu if (typeof (constantIndices) === "object") { // HACK: We have a known constant shuffle vector indices. Expand it to // byte indices and then embed a new constant in the trace. - const newShuffleVector = new Uint8Array(sizeOfV128), - nativeIndices = new Uint8Array(constantIndices.buffer, constantIndices.byteOffset, elementCount); - - for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) { - let elementIndex = 0; - for (let j = 0; j < elementSize; j++) { - if (j == 0) - elementIndex = nativeIndices[k + j]; - else if (nativeIndices[k + j] > 0) { - // this an invalid index, set it to zero - elementIndex = 0; - break; + let nativeIndices = new Uint8Array(constantIndices.buffer, constantIndices.byteOffset, elementCount); + if (elementCount !== 16) { + const newShuffleVector = new Uint8Array(sizeOfV128); + for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) { + let elementIndex = 0; + for (let j = 0; j < elementSize; j++) { + if (j == 0) + elementIndex = nativeIndices[k + j]; + else if (nativeIndices[k + j] > 0) { + // this an invalid index, set it to zero + elementIndex = 0; + break; + } + } + for (let j = 0; j < elementSize; j++) { + newShuffleVector[k + j] = (elementIndex * elementSize) + j; } } - for (let j = 0; j < elementSize; j++) { - newShuffleVector[k + j] = (elementIndex * elementSize) + j; - } + nativeIndices = newShuffleVector; } // console.log(`shuffle w/element size ${elementSize} with constant indices ${nativeIndices} (${constantIndices}) -> byte indices ${newShuffleVector}`); builder.appendSimd(WasmSimdOpcode.v128_const); - builder.appendBytes(newShuffleVector); + builder.appendBytes(nativeIndices); } else { // Load indices (in chars) append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); - if (elementCount < 16) { + if (elementCount !== 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; - if (check_bounds) { + if (checkBounds) { builder.local("shuffle_indices", WasmOpcode.tee_local); builder.appendSimd(WasmSimdOpcode.v128_const); From 74d3ceb898c81263429c4288890efaa477cd223b Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Tue, 6 May 2025 17:55:27 -0500 Subject: [PATCH 06/24] Fix condition --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 34b241854c2ad3..92eb444546578b 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3909,11 +3909,11 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.appendU8(elementSize - 1); } if (elementSize === 2) - builder.appendSimd(WasmSimdOpcode.i16x8_ge_u); + builder.appendSimd(WasmSimdOpcode.i16x8_gt_u); else if (elementSize === 4) - builder.appendSimd(WasmSimdOpcode.i32x4_ge_u); + builder.appendSimd(WasmSimdOpcode.i32x4_gt_u); else if (elementSize === 8) { - builder.appendSimd(WasmSimdOpcode.i64x2_ge_s); + builder.appendSimd(WasmSimdOpcode.i64x2_gt_s); } builder.local("shuffle_indices"); @@ -3935,7 +3935,6 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu for (let j = 0; j < elementSize; j++) builder.appendU8(j); } - builder.appendSimd(WasmSimdOpcode.v128_or); } } From 4e0a4257620c8e0acd2037463ff6052038e8074a Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Tue, 6 May 2025 21:25:39 -0500 Subject: [PATCH 07/24] use safe invalid --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 92eb444546578b..489a457a8acdd7 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3880,9 +3880,10 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu for (let j = 0; j < elementSize; j++) { if (j == 0) elementIndex = nativeIndices[k + j]; - else if (nativeIndices[k + j] > 0) { - // this an invalid index, set it to zero - elementIndex = 0; + + if (elementIndex >= elementCount || nativeIndices[k + j] != 0) { + // this an invalid index make invalid in a safe way + elementIndex = elementCount; break; } } From d7a77a51a9e9122fe55faff57bd7644961ba1618 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Tue, 6 May 2025 21:29:30 -0500 Subject: [PATCH 08/24] use the correct max --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 489a457a8acdd7..b07378d7cc909b 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3907,7 +3907,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { - builder.appendU8(elementSize - 1); + builder.appendU8(elementCount - 1); } if (elementSize === 2) builder.appendSimd(WasmSimdOpcode.i16x8_gt_u); From 45d8fe19286e6bdb6aad64e91375d68eead5afef Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 11:30:18 -0500 Subject: [PATCH 09/24] fix the min condition in the const path --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index b07378d7cc909b..62008290c1dc87 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3881,7 +3881,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu if (j == 0) elementIndex = nativeIndices[k + j]; - if (elementIndex >= elementCount || nativeIndices[k + j] != 0) { + if (elementIndex >= elementCount || (j > 0 && nativeIndices[k + j] != 0)) { // this an invalid index make invalid in a safe way elementIndex = elementCount; break; From c8c8fdcc3f3b788efaa7a0038e374467ea93a7f9 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 14:41:02 -0500 Subject: [PATCH 10/24] Add comments and clean up implementation slightly --- .../runtime/jiterpreter-trace-generator.ts | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 62008290c1dc87..2de84415d40f20 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3876,19 +3876,22 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu if (elementCount !== 16) { const newShuffleVector = new Uint8Array(sizeOfV128); for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) { - let elementIndex = 0; - for (let j = 0; j < elementSize; j++) { - if (j == 0) - elementIndex = nativeIndices[k + j]; - - if (elementIndex >= elementCount || (j > 0 && nativeIndices[k + j] != 0)) { - // this an invalid index make invalid in a safe way + // The indices are lane sized but are only valid when less than + // elementCount so we load them as bytes, read the first byte for + // the value then invalidate the value if any of the other bytes are != 0. + let elementIndex = nativeIndices[k]; + + // check the remaining bytes of the element for > 0 which also invalidates the index + for (let j = 1; j < elementSize && elementIndex < elementCount; j++) { + if (nativeIndices[k + j] > 0) { elementIndex = elementCount; - break; } } + for (let j = 0; j < elementSize; j++) { - newShuffleVector[k + j] = (elementIndex * elementSize) + j; + // we use min elementCount to force invalid indices to fit in a byte which + // the intrinsic will handle by zeroing the lane + newShuffleVector[k + j] = Math.min(elementIndex * elementSize + j, elementCount); } } nativeIndices = newShuffleVector; @@ -3902,28 +3905,37 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu if (elementCount !== 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; + // Like above the indices are lane sized but only valid when less than elementCount + // so we check for that and invalidate the index in that case by forcing it + // to an invalid value for Vecor128.Shuffle. if (checkBounds) { - builder.local("shuffle_indices", WasmOpcode.tee_local); + if (elementSize === 8) + builder.local("shuffle_indices", WasmOpcode.tee_local); builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { - builder.appendU8(elementCount - 1); + builder.appendU8(elementCount); } + if (elementSize === 2) - builder.appendSimd(WasmSimdOpcode.i16x8_gt_u); + builder.appendSimd(WasmSimdOpcode.i16x8_min_u); else if (elementSize === 4) - builder.appendSimd(WasmSimdOpcode.i32x4_gt_u); + builder.appendSimd(WasmSimdOpcode.i32x4_min_u); else if (elementSize === 8) { - builder.appendSimd(WasmSimdOpcode.i64x2_gt_s); + builder.appendSimd(WasmSimdOpcode.i64x2_ge_s); + builder.local("shuffle_indices"); + builder.appendSimd(WasmSimdOpcode.v128_or); } - - builder.local("shuffle_indices"); - builder.appendSimd(WasmSimdOpcode.v128_or); } + // We need to convert lane indices to byte indices so we can + // use the swizzle opcode: + // 1: multiply the indices by elementSize using shl to + // get the byte offset of the first byte in the expanded land builder.i32_const(shift); builder.appendSimd(WasmSimdOpcode.i8x16_shl); + // 2: fill all the byte elements of the lane with the shifted values builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { for (let j = 0; j < elementSize; j++) @@ -3931,6 +3943,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu } builder.appendSimd(WasmSimdOpcode.i8x16_swizzle); + // 3: Or the shifted values with the byte offset inside the lane builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { for (let j = 0; j < elementSize; j++) From 840a256533c5a3961dde3d45f27fd01c1ecb1357 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 14:51:31 -0500 Subject: [PATCH 11/24] Remove the incomplete boundsCheck logic for now --- .../runtime/jiterpreter-trace-generator.ts | 28 ++----------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 2de84415d40f20..61cb226857da6c 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3849,7 +3849,7 @@ function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin else if (index === SimdIntrinsic3.V128_I8_SHUFFLE) elementCount = 2; - return emit_shuffle(builder, ip, elementCount, true); + return emit_shuffle(builder, ip, elementCount); } default: return false; @@ -3860,7 +3860,7 @@ function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin // implement shuffles on top of wasm's swizzle opcode by expanding the // element shuffle indices into byte indices -function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number, checkBounds: boolean): boolean { +function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number): boolean { const elementSize = 16 / elementCount, indicesOffset = getArgU16(ip, 3), constantIndices = get_known_constant_value(builder, indicesOffset); @@ -3904,30 +3904,6 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); if (elementCount !== 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; - - // Like above the indices are lane sized but only valid when less than elementCount - // so we check for that and invalidate the index in that case by forcing it - // to an invalid value for Vecor128.Shuffle. - if (checkBounds) { - if (elementSize === 8) - builder.local("shuffle_indices", WasmOpcode.tee_local); - - builder.appendSimd(WasmSimdOpcode.v128_const); - for (let i = 0; i < elementCount; i++) { - builder.appendU8(elementCount); - } - - if (elementSize === 2) - builder.appendSimd(WasmSimdOpcode.i16x8_min_u); - else if (elementSize === 4) - builder.appendSimd(WasmSimdOpcode.i32x4_min_u); - else if (elementSize === 8) { - builder.appendSimd(WasmSimdOpcode.i64x2_ge_s); - builder.local("shuffle_indices"); - builder.appendSimd(WasmSimdOpcode.v128_or); - } - } - // We need to convert lane indices to byte indices so we can // use the swizzle opcode: // 1: multiply the indices by elementSize using shl to From efbb01cfddd8561df505663afbe46b5e4bd7df12 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 14:58:00 -0500 Subject: [PATCH 12/24] Update comment --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 61cb226857da6c..ccc552b891e117 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3900,7 +3900,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.appendSimd(WasmSimdOpcode.v128_const); builder.appendBytes(nativeIndices); } else { - // Load indices (in chars) + // Load indices as v128 append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); if (elementCount !== 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; From d191e375264915cbb4cf62ea83fcc3140db4d138 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 15:56:03 -0500 Subject: [PATCH 13/24] Update comments for clarity --- .../runtime/jiterpreter-trace-generator.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index ccc552b891e117..b90056cefbe2e7 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3889,8 +3889,11 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu } for (let j = 0; j < elementSize; j++) { - // we use min elementCount to force invalid indices to fit in a byte which - // the intrinsic will handle by zeroing the lane + // The shuffle vector is lane sized but the swizzle opcode needs byte indices + // so we multiply the lane index by elementSize to get the byte offset then add + // the offset of the byte inside the lane. Since we are using lanes as indices + // we need to check if the lane index is valid. We use min elementCount to force + // invalid indices to fit in the byte sized land and let the intrinsic handle it. newShuffleVector[k + j] = Math.min(elementIndex * elementSize + j, elementCount); } } @@ -3906,12 +3909,14 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; // We need to convert lane indices to byte indices so we can // use the swizzle opcode: - // 1: multiply the indices by elementSize using shl to - // get the byte offset of the first byte in the expanded land + // + // 1: multiply the lane indices by elementSize using shl to + // get the byte offset of the first byte in the 16 lanes builder.i32_const(shift); builder.appendSimd(WasmSimdOpcode.i8x16_shl); - // 2: fill all the byte elements of the lane with the shifted values + // 2: create a vector to swizzle the shifted first byte + // of each lane into every byte of that lane. builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { for (let j = 0; j < elementSize; j++) @@ -3919,7 +3924,8 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu } builder.appendSimd(WasmSimdOpcode.i8x16_swizzle); - // 3: Or the shifted values with the byte offset inside the lane + // 3: create a vector with the offset of each byte inside each + // lane then Or it with the now shifted and swizzled indices. builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { for (let j = 0; j < elementSize; j++) From b46d94998f72b238311ad14dbe8efb831e0ba657 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 16:00:14 -0500 Subject: [PATCH 14/24] Update comments for clarity --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index b90056cefbe2e7..07c5b1be235a8e 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3915,7 +3915,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.i32_const(shift); builder.appendSimd(WasmSimdOpcode.i8x16_shl); - // 2: create a vector to swizzle the shifted first byte + // 2: create a vector to swizzle the now shifted first byte // of each lane into every byte of that lane. builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { @@ -3926,6 +3926,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu // 3: create a vector with the offset of each byte inside each // lane then Or it with the now shifted and swizzled indices. + // It is safe to use Or directly thanks to the previous shift builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { for (let j = 0; j < elementSize; j++) From beb74f64ca3dccd2baaa5f57d27a9545dbd2c00c Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 16:02:33 -0500 Subject: [PATCH 15/24] Update comments for clarity --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 07c5b1be235a8e..228e00747fb651 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3893,7 +3893,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu // so we multiply the lane index by elementSize to get the byte offset then add // the offset of the byte inside the lane. Since we are using lanes as indices // we need to check if the lane index is valid. We use min elementCount to force - // invalid indices to fit in the byte sized land and let the intrinsic handle it. + // invalid indices to fit in the byte sized lane and let the intrinsic handle it. newShuffleVector[k + j] = Math.min(elementIndex * elementSize + j, elementCount); } } From 78b21fad0c4d475cda70a0a8ed18a086273a35ea Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 19:34:57 -0500 Subject: [PATCH 16/24] clamp non const indices --- .../runtime/jiterpreter-trace-generator.ts | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 228e00747fb651..4bd233d6d95f4d 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3907,16 +3907,36 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); if (elementCount !== 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; + // clamp indices to the first invalid index which is elementCount + // this ensures that the multiply + add will not overflow the lane size + if (elementCount === 4) { + builder.i32_const(elementCount); + builder.appendSimd(WasmSimdOpcode.i32x4_splat); + builder.appendSimd(WasmSimdOpcode.i32x4_min_u); + } else if (elementCount === 8) { + builder.i32_const(elementCount); + builder.appendSimd(WasmSimdOpcode.i16x8_splat); + builder.appendSimd(WasmSimdOpcode.i16x8_min_u); + } else { + // i64x2 can fall back the the interpreter implementation + false; + } + // We need to convert lane indices to byte indices so we can - // use the swizzle opcode: + // use the swizzle opcode, The operations is the same as above but vectorized: + // i32x4{3, 2, 1, 0} => i8x16{12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3} // // 1: multiply the lane indices by elementSize using shl to // get the byte offset of the first byte in the 16 lanes + // {3,0,0,0, 2,0,0,0, 1,0,0,0, 0,0,0,0} 2 i16x8.shl + // => {12,0,0,0, 8,0,0,0, 4,0,0,0, 0,0,0,0} builder.i32_const(shift); builder.appendSimd(WasmSimdOpcode.i8x16_shl); // 2: create a vector to swizzle the now shifted first byte // of each lane into every byte of that lane. + // {12,0,0,0, 8,0,0,0, 4,0,0,0, 0,0,0,0} {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3} i8x16.swizzle + // => {12,12,12,12, 8,8,8,8, 4,4,4,4, 0,0,0,0} builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { for (let j = 0; j < elementSize; j++) @@ -3927,6 +3947,8 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu // 3: create a vector with the offset of each byte inside each // lane then Or it with the now shifted and swizzled indices. // It is safe to use Or directly thanks to the previous shift + // {12,12,12,12, 8,8,8,8, 4,4,4,4, 0,0,0,0} {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3} i8x16.or + // => {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3} builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { for (let j = 0; j < elementSize; j++) From 343d2b359f16744598d9e04901958ca205591ee9 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 19:36:11 -0500 Subject: [PATCH 17/24] clamp non const indices --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 4bd233d6d95f4d..f5c92c07980648 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3909,6 +3909,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; // clamp indices to the first invalid index which is elementCount // this ensures that the multiply + add will not overflow the lane size + // but the indices will be invalid for the intrinsic if (elementCount === 4) { builder.i32_const(elementCount); builder.appendSimd(WasmSimdOpcode.i32x4_splat); From 9e80844584d982e591921b0063b7d532b5cc93a9 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 19:55:45 -0500 Subject: [PATCH 18/24] clean up comments --- .../browser/runtime/jiterpreter-trace-generator.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index f5c92c07980648..1f6fe4308954cc 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3909,7 +3909,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; // clamp indices to the first invalid index which is elementCount // this ensures that the multiply + add will not overflow the lane size - // but the indices will be invalid for the intrinsic + // but the indices will remain invalid for the intrinsic if (elementCount === 4) { builder.i32_const(elementCount); builder.appendSimd(WasmSimdOpcode.i32x4_splat); @@ -3925,18 +3925,21 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu // We need to convert lane indices to byte indices so we can // use the swizzle opcode, The operations is the same as above but vectorized: + // for example: // i32x4{3, 2, 1, 0} => i8x16{12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3} // // 1: multiply the lane indices by elementSize using shl to // get the byte offset of the first byte in the 16 lanes - // {3,0,0,0, 2,0,0,0, 1,0,0,0, 0,0,0,0} 2 i16x8.shl + // i32x4{3, 2, 1, 0} + // ----------------------------------------- + // 2 i16x8.shl // => {12,0,0,0, 8,0,0,0, 4,0,0,0, 0,0,0,0} builder.i32_const(shift); builder.appendSimd(WasmSimdOpcode.i8x16_shl); // 2: create a vector to swizzle the now shifted first byte // of each lane into every byte of that lane. - // {12,0,0,0, 8,0,0,0, 4,0,0,0, 0,0,0,0} {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3} i8x16.swizzle + // {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} i8x16.swizzle // => {12,12,12,12, 8,8,8,8, 4,4,4,4, 0,0,0,0} builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { @@ -3948,7 +3951,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu // 3: create a vector with the offset of each byte inside each // lane then Or it with the now shifted and swizzled indices. // It is safe to use Or directly thanks to the previous shift - // {12,12,12,12, 8,8,8,8, 4,4,4,4, 0,0,0,0} {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3} i8x16.or + // {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3} i8x16.or // => {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3} builder.appendSimd(WasmSimdOpcode.v128_const); for (let i = 0; i < elementCount; i++) { From 112339e3f7752c06e36bc3fd123e96cd0f90507d Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 21:37:19 -0500 Subject: [PATCH 19/24] Address errors from review --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 1f6fe4308954cc..eb0b42e6191bbf 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3872,7 +3872,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu if (typeof (constantIndices) === "object") { // HACK: We have a known constant shuffle vector indices. Expand it to // byte indices and then embed a new constant in the trace. - let nativeIndices = new Uint8Array(constantIndices.buffer, constantIndices.byteOffset, elementCount); + let nativeIndices = new Uint8Array(constantIndices.buffer, constantIndices.byteOffset, 16); if (elementCount !== 16) { const newShuffleVector = new Uint8Array(sizeOfV128); for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) { @@ -3920,7 +3920,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.appendSimd(WasmSimdOpcode.i16x8_min_u); } else { // i64x2 can fall back the the interpreter implementation - false; + return false; } // We need to convert lane indices to byte indices so we can From 5dcc5b9522590f8c87f0d4a0c8275b2d8467f926 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 22:47:48 -0500 Subject: [PATCH 20/24] handle i64x2 again --- .../runtime/jiterpreter-trace-generator.ts | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index eb0b42e6191bbf..21bab406b92126 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3877,16 +3877,14 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu const newShuffleVector = new Uint8Array(sizeOfV128); for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) { // The indices are lane sized but are only valid when less than - // elementCount so we load them as bytes, read the first byte for - // the value then invalidate the value if any of the other bytes are != 0. + // elementCount. To avoid some complications we load them as bytes, read the + // first byte of the lane for the lane index then make sure remaining bytes are 0. let elementIndex = nativeIndices[k]; - // check the remaining bytes of the element for > 0 which also invalidates the index - for (let j = 1; j < elementSize && elementIndex < elementCount; j++) { - if (nativeIndices[k + j] > 0) { - elementIndex = elementCount; - } - } + // if any of the remaining bytes in the lane are != 0 it was an invalid index + // so we set the elementIndex to elementCount to force it to be invalid + for (let j = 1; j < elementSize && elementIndex < elementCount; j++) + elementIndex = nativeIndices[k + j] === 0 ? elementIndex : elementCount; for (let j = 0; j < elementSize; j++) { // The shuffle vector is lane sized but the swizzle opcode needs byte indices @@ -3899,7 +3897,6 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu } nativeIndices = newShuffleVector; } - // console.log(`shuffle w/element size ${elementSize} with constant indices ${nativeIndices} (${constantIndices}) -> byte indices ${newShuffleVector}`); builder.appendSimd(WasmSimdOpcode.v128_const); builder.appendBytes(nativeIndices); } else { @@ -3907,9 +3904,9 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); if (elementCount !== 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; - // clamp indices to the first invalid index which is elementCount + // clamp indices to the first invalid index value which is elementCount // this ensures that the multiply + add will not overflow the lane size - // but the indices will remain invalid for the intrinsic + // but the lane index will remain invalid for the intrinsic if (elementCount === 4) { builder.i32_const(elementCount); builder.appendSimd(WasmSimdOpcode.i32x4_splat); @@ -3918,9 +3915,6 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.i32_const(elementCount); builder.appendSimd(WasmSimdOpcode.i16x8_splat); builder.appendSimd(WasmSimdOpcode.i16x8_min_u); - } else { - // i64x2 can fall back the the interpreter implementation - return false; } // We need to convert lane indices to byte indices so we can @@ -3959,6 +3953,17 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.appendU8(j); } builder.appendSimd(WasmSimdOpcode.v128_or); + + // for i64x2 we don't have a min so we divide by 8 + // check if any bits are still set and if so, invalidate those lanes + if (elementCount == 2) { + append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + builder.i32_const(32); + builder.appendSimd(WasmSimdOpcode.i64x2_shr_u); + builder.v128_const(0); + builder.appendSimd(WasmSimdOpcode.i64x2_ne); + builder.appendSimd(WasmSimdOpcode.v128_or); + } } } // we now have two vectors on the stack, the values and the byte indices From 5f3766a28c34cb6e59c5a1a4bef590900e94af87 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 22:48:29 -0500 Subject: [PATCH 21/24] fix --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 21bab406b92126..6eceb99c5a8de9 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3958,7 +3958,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu // check if any bits are still set and if so, invalidate those lanes if (elementCount == 2) { append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); - builder.i32_const(32); + builder.i32_const(shift); builder.appendSimd(WasmSimdOpcode.i64x2_shr_u); builder.v128_const(0); builder.appendSimd(WasmSimdOpcode.i64x2_ne); From 33fcfd2286a7a11d539e141c0385b7d1875469bc Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 22:50:05 -0500 Subject: [PATCH 22/24] fix --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index 6eceb99c5a8de9..baf90301a0297e 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3958,7 +3958,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu // check if any bits are still set and if so, invalidate those lanes if (elementCount == 2) { append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); - builder.i32_const(shift); + builder.i32_const(1); builder.appendSimd(WasmSimdOpcode.i64x2_shr_u); builder.v128_const(0); builder.appendSimd(WasmSimdOpcode.i64x2_ne); From c07112d2d9fcc3b4ed8262ebe934a097977e1dfd Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 22:51:19 -0500 Subject: [PATCH 23/24] fix --- src/mono/browser/runtime/jiterpreter-trace-generator.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index baf90301a0297e..a9a15c62eb5fa4 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3954,8 +3954,9 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu } builder.appendSimd(WasmSimdOpcode.v128_or); - // for i64x2 we don't have a min so we divide by 8 - // check if any bits are still set and if so, invalidate those lanes + // for i64x2 we don't have a min so reload the original indices + // divide by 2 and check if any bits are still set. If so, invalidate + // those lanes if (elementCount == 2) { append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); builder.i32_const(1); From aeb7f726187c9c3982e733f37b2136f892bfc913 Mon Sep 17 00:00:00 2001 From: Larry Ewing Date: Wed, 7 May 2025 23:11:59 -0500 Subject: [PATCH 24/24] More adjustments --- .../runtime/jiterpreter-trace-generator.ts | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/mono/browser/runtime/jiterpreter-trace-generator.ts b/src/mono/browser/runtime/jiterpreter-trace-generator.ts index a9a15c62eb5fa4..60c799eeb31138 100644 --- a/src/mono/browser/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/browser/runtime/jiterpreter-trace-generator.ts @@ -3882,9 +3882,8 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu let elementIndex = nativeIndices[k]; // if any of the remaining bytes in the lane are != 0 it was an invalid index - // so we set the elementIndex to elementCount to force it to be invalid for (let j = 1; j < elementSize && elementIndex < elementCount; j++) - elementIndex = nativeIndices[k + j] === 0 ? elementIndex : elementCount; + elementIndex = nativeIndices[k + j] === 0 ? elementIndex : 0xff; for (let j = 0; j < elementSize; j++) { // The shuffle vector is lane sized but the swizzle opcode needs byte indices @@ -3892,7 +3891,7 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu // the offset of the byte inside the lane. Since we are using lanes as indices // we need to check if the lane index is valid. We use min elementCount to force // invalid indices to fit in the byte sized lane and let the intrinsic handle it. - newShuffleVector[k + j] = Math.min(elementIndex * elementSize + j, elementCount); + newShuffleVector[k + j] = Math.min(elementIndex, elementCount) * elementSize + j; } } nativeIndices = newShuffleVector; @@ -3904,9 +3903,10 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); if (elementCount !== 16) { const shift = elementCount === 8 ? 1 : elementCount === 4 ? 2 : 3; - // clamp indices to the first invalid index value which is elementCount - // this ensures that the multiply + add will not overflow the lane size - // but the lane index will remain invalid for the intrinsic + // for i32x4 and i64x2 clamp indices to the first invalid index value which + // is elementCount this ensures that the multiply + add will not overflow + // the lane size but the lane index will remain invalid for the intrinsic to + // handle. if (elementCount === 4) { builder.i32_const(elementCount); builder.appendSimd(WasmSimdOpcode.i32x4_splat); @@ -3918,8 +3918,8 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu } // We need to convert lane indices to byte indices so we can - // use the swizzle opcode, The operations is the same as above but vectorized: - // for example: + // use the swizzle opcode, The operations is the same as above + // but vectorized. for example: // i32x4{3, 2, 1, 0} => i8x16{12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3} // // 1: multiply the lane indices by elementSize using shl to @@ -3955,8 +3955,8 @@ function emit_shuffle (builder: WasmBuilder, ip: MintOpcodePtr, elementCount: nu builder.appendSimd(WasmSimdOpcode.v128_or); // for i64x2 we don't have a min so reload the original indices - // divide by 2 and check if any bits are still set. If so, invalidate - // those lanes + // divide by 2 and check if any bits are still set. If so, + // invalidate those lanes if (elementCount == 2) { append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); builder.i32_const(1);