Skip to content

Commit 33c3e47

Browse files
author
Lars T Hansen
committed
Bug 1724201 - Relaxed SIMD FMA/FMS for x86 and arm64. r=yury
Implement the fused multiply-add and fused multiply-sub relaxed SIMD operations. See WebAssembly/relaxed-simd#27 for the proposed spec of these operations. There's no wat support for this yet - it will come in separately - so the test cases are a little rudimentary for now. More tests will appear later. Differential Revision: https://phabricator.services.mozilla.com/D121870
1 parent ec888b6 commit 33c3e47

14 files changed

+248
-5
lines changed

js/src/jit-test/lib/wasm-binary.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ const F64x2PMinCode = 0xf6;
133133
const F64x2PMaxCode = 0xf7;
134134
const V128Load32ZeroCode = 0xfc;
135135
const V128Load64ZeroCode = 0xfd;
136+
// Experimental relaxed-SIMD opcodes: lane-wise multiply-add (Fma) and
// multiply-subtract (Fms) for f32x4 and f64x2 vectors.
const F32x4RelaxedFmaCode = 0xaf;
137+
const F32x4RelaxedFmsCode = 0xb0;
138+
const F64x2RelaxedFmaCode = 0xcf;
139+
const F64x2RelaxedFmsCode = 0xd0;
136140

137141
// SIMD wormhole opcodes.
138142
const WORMHOLE_SELFTEST = 0;

js/src/jit-test/tests/wasm/simd/experimental.js

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
// |jit-test| --wasm-relaxed-simd; skip-if: !wasmSimdEnabled()
2+
13
// Experimental opcodes. We have no text parsing support for these yet. The
24
// tests will be cleaned up and moved into ad-hack.js if the opcodes are
35
// adopted.
@@ -61,4 +63,43 @@ function V128StoreExpr(addr, v) {
6163
SimdPrefix, V128StoreCode, 4, varU32(0)];
6264
}
6365

64-
// (Currently no tests here but there were some in the past and there will be more in the future.)
66+
// FMA/FMS, https://github.com/WebAssembly/relaxed-simd/issues/27
67+
68+
// Scalar reference model for the multiply-add opcodes: returns a + x*y,
// computed as a separate multiply followed by an add.
function fma(a, x, y) {
    const product = x * y;
    return a + product;
}
69+
// Scalar reference model for the multiply-subtract opcodes: returns a - x*y,
// computed as a separate multiply followed by a subtract.
function fms(a, x, y) {
    const product = x * y;
    return a - product;
}
70+
71+
var fas = [0, 100, 500, 700];
72+
var fxs = [10, 20, 30, 40];
73+
var fys = [-2, -3, -4, -5];
74+
var das = [0, 100];
75+
var dxs = [10, 20];
76+
var dys = [-2, -3];
77+
78+
// Exercise each of the four FMA/FMS opcodes: build a one-function module that
// applies the opcode to three vectors held in memory, run it, and compare the
// stored result lane-by-lane with the scalar reference `operator`.
for ( let [opcode, as, xs, ys, operator] of [[F32x4RelaxedFmaCode, fas, fxs, fys, fma],
79+
[F32x4RelaxedFmsCode, fas, fxs, fys, fms],
80+
[F64x2RelaxedFmaCode, das, dxs, dys, fma],
81+
[F64x2RelaxedFmsCode, das, dxs, dys, fms]] ) {
82+
// Lane count of the vectors under test: 4 for the f32x4 inputs, 2 for f64x2.
var k = xs.length;
83+
// Expected per-lane results from the scalar reference function.
var ans = iota(k).map((i) => operator(as[i], xs[i], ys[i]))
84+
85+
// The exported "run" function loads three v128 operands (arguments 16, 32
// and 48 -- presumably byte addresses, verify against V128Load), applies
// `opcode`, and stores the result via V128StoreExpr at address 0.
var ins = wasmEval(moduleWithSections([
86+
sigSection([v2vSig]),
87+
declSection([0]),
88+
memorySection(1),
89+
exportSection([{funcIndex: 0, name: "run"},
90+
{memIndex: 0, name: "mem"}]),
91+
bodySection([
92+
funcBody({locals:[],
93+
body: [...V128StoreExpr(0, [...V128Load(16),
94+
...V128Load(32),
95+
...V128Load(48),
96+
SimdPrefix, varU32(opcode)])]})])]));
97+
98+
// View the memory with the element type matching the lane width. Element
// index k is byte offset 16 in both cases (4 x 4-byte or 2 x 8-byte lanes),
// so indices k, 2*k, 3*k line up with offsets 16, 32, 48 -- assuming
// set(mem, i, vals) writes vals starting at element i (helper not shown here).
var mem = new (k == 4 ? Float32Array : Float64Array)(ins.exports.mem.buffer);
99+
set(mem, k, as);
100+
set(mem, 2*k, xs);
101+
set(mem, 3*k, ys);
102+
ins.exports.run();
103+
// Read back the k result lanes starting at element 0 and check them.
var result = get(mem, 0, k);
104+
assertSame(result, ans);
105+
}

js/src/jit/MacroAssembler.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3455,6 +3455,20 @@ class MacroAssembler : public MacroAssemblerSpecific {
34553455
inline void nearestFloat64x2(FloatRegister src, FloatRegister dest)
34563456
DEFINED_ON(x86_shared, arm64);
34573457

3458+
// Floating multiply-accumulate: srcDest [+-]= src1 * src2
3459+
3460+
inline void fmaFloat32x4(FloatRegister src1, FloatRegister src2,
3461+
FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
3462+
3463+
inline void fmsFloat32x4(FloatRegister src1, FloatRegister src2,
3464+
FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
3465+
3466+
inline void fmaFloat64x2(FloatRegister src1, FloatRegister src2,
3467+
FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
3468+
3469+
inline void fmsFloat64x2(FloatRegister src1, FloatRegister src2,
3470+
FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);
3471+
34583472
public:
34593473
// ========================================================================
34603474
// Truncate floating point.

js/src/jit/arm64/CodeGenerator-arm64.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2976,6 +2976,22 @@ void CodeGenerator::visitWasmTernarySimd128(LWasmTernarySimd128* ins) {
29762976
masm.bitwiseSelectSimd128(lhs, rhs, controlDest);
29772977
break;
29782978
}
2979+
case wasm::SimdOp::F32x4RelaxedFma:
2980+
masm.fmaFloat32x4(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
2981+
ToFloatRegister(ins->v0()));
2982+
break;
2983+
case wasm::SimdOp::F32x4RelaxedFms:
2984+
masm.fmsFloat32x4(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
2985+
ToFloatRegister(ins->v0()));
2986+
break;
2987+
case wasm::SimdOp::F64x2RelaxedFma:
2988+
masm.fmaFloat64x2(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
2989+
ToFloatRegister(ins->v0()));
2990+
break;
2991+
case wasm::SimdOp::F64x2RelaxedFms:
2992+
masm.fmsFloat64x2(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
2993+
ToFloatRegister(ins->v0()));
2994+
break;
29792995
default:
29802996
MOZ_CRASH("NYI");
29812997
}

js/src/jit/arm64/Lowering-arm64.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,16 @@ void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) {
994994
defineReuseInput(lir, ins, LWasmTernarySimd128::V2);
995995
break;
996996
}
997+
case wasm::SimdOp::F32x4RelaxedFma:
998+
case wasm::SimdOp::F32x4RelaxedFms:
999+
case wasm::SimdOp::F64x2RelaxedFma:
1000+
case wasm::SimdOp::F64x2RelaxedFms: {
1001+
auto* lir = new (alloc())
1002+
LWasmTernarySimd128(ins->simdOp(), useRegisterAtStart(ins->v0()),
1003+
useRegister(ins->v1()), useRegister(ins->v2()));
1004+
defineReuseInput(lir, ins, LWasmTernarySimd128::V0);
1005+
break;
1006+
}
9971007
default:
9981008
MOZ_CRASH("NYI");
9991009
}

js/src/jit/arm64/MacroAssembler-arm64-inl.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3827,6 +3827,28 @@ void MacroAssembler::nearestFloat64x2(FloatRegister src, FloatRegister dest) {
38273827
Frintn(Simd2D(dest), Simd2D(src));
38283828
}
38293829

3830+
// Floating multiply-accumulate: srcDest [+-]= src1 * src2
3831+
3832+
// Lane-wise f32x4 multiply-accumulate: srcDest += src1 * src2.
void MacroAssembler::fmaFloat32x4(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // Take the 4S-arrangement view of each register before emitting Fmla.
  const auto accumulator = Simd4S(srcDest);
  const auto multiplicand = Simd4S(src1);
  const auto multiplier = Simd4S(src2);
  Fmla(accumulator, multiplicand, multiplier);
}
3836+
3837+
// Lane-wise f32x4 multiply-subtract: srcDest -= src1 * src2.
void MacroAssembler::fmsFloat32x4(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // Take the 4S-arrangement view of each register before emitting Fmls.
  const auto accumulator = Simd4S(srcDest);
  const auto multiplicand = Simd4S(src1);
  const auto multiplier = Simd4S(src2);
  Fmls(accumulator, multiplicand, multiplier);
}
3841+
3842+
// Lane-wise f64x2 multiply-accumulate: srcDest += src1 * src2.
void MacroAssembler::fmaFloat64x2(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // Take the 2D-arrangement view of each register before emitting Fmla.
  const auto accumulator = Simd2D(srcDest);
  const auto multiplicand = Simd2D(src1);
  const auto multiplier = Simd2D(src2);
  Fmla(accumulator, multiplicand, multiplier);
}
3846+
3847+
// Lane-wise f64x2 multiply-subtract: srcDest -= src1 * src2.
void MacroAssembler::fmsFloat64x2(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // Take the 2D-arrangement view of each register before emitting Fmls.
  const auto accumulator = Simd2D(srcDest);
  const auto multiplicand = Simd2D(src1);
  const auto multiplier = Simd2D(src2);
  Fmls(accumulator, multiplicand, multiplier);
}
3851+
38303852
//}}} check_macroassembler_style
38313853
// ===============================================================
38323854

js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2270,6 +2270,22 @@ void CodeGenerator::visitWasmTernarySimd128(LWasmTernarySimd128* ins) {
22702270
masm.bitwiseSelectSimd128(control, lhsDest, rhs, lhsDest, temp);
22712271
break;
22722272
}
2273+
case wasm::SimdOp::F32x4RelaxedFma:
2274+
masm.fmaFloat32x4(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
2275+
ToFloatRegister(ins->v0()));
2276+
break;
2277+
case wasm::SimdOp::F32x4RelaxedFms:
2278+
masm.fmsFloat32x4(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
2279+
ToFloatRegister(ins->v0()));
2280+
break;
2281+
case wasm::SimdOp::F64x2RelaxedFma:
2282+
masm.fmaFloat64x2(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
2283+
ToFloatRegister(ins->v0()));
2284+
break;
2285+
case wasm::SimdOp::F64x2RelaxedFms:
2286+
masm.fmsFloat64x2(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
2287+
ToFloatRegister(ins->v0()));
2288+
break;
22732289
default:
22742290
MOZ_CRASH("NYI");
22752291
}

js/src/jit/x86-shared/Lowering-x86-shared.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,16 @@ void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) {
832832
defineReuseInput(lir, ins, LWasmTernarySimd128::V0);
833833
break;
834834
}
835+
case wasm::SimdOp::F32x4RelaxedFma:
836+
case wasm::SimdOp::F32x4RelaxedFms:
837+
case wasm::SimdOp::F64x2RelaxedFma:
838+
case wasm::SimdOp::F64x2RelaxedFms: {
839+
auto* lir = new (alloc())
840+
LWasmTernarySimd128(ins->simdOp(), useRegisterAtStart(ins->v0()),
841+
useRegister(ins->v1()), useRegister(ins->v2()));
842+
defineReuseInput(lir, ins, LWasmTernarySimd128::V0);
843+
break;
844+
}
835845
default:
836846
MOZ_CRASH("NYI");
837847
}

js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2837,6 +2837,41 @@ void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src,
28372837
vpmovzxdq(Operand(dest), dest);
28382838
}
28392839

2840+
// Floating multiply-accumulate: srcDest [+-]= src1 * src2
2841+
// The Intel FMA feature is some AVX* special sauce, no support yet.
2842+
2843+
// Lane-wise f32x4 multiply-accumulate: srcDest += src1 * src2.
//
// Emitted as a separate multiply and add, so the product is formed in a
// temporary before it is accumulated; src1 and src2 are left unmodified.
// NOTE(review): this is the unfused form -- confirm it is acceptable under the
// relaxed-SIMD proposal referenced by this change.
void MacroAssembler::fmaFloat32x4(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // The temporary holds a whole 128-bit vector, so claim the SIMD scratch
  // register rather than the scalar float32 scratch.
  ScratchSimd128Scope scratch(*this);
  moveSimd128(src1, scratch);
  mulFloat32x4(src2, scratch);
  addFloat32x4(scratch, srcDest);
}
2850+
2851+
// Lane-wise f32x4 multiply-subtract: srcDest -= src1 * src2.
//
// Emitted as a separate multiply and subtract, so the product is formed in a
// temporary before it is subtracted; src1 and src2 are left unmodified.
// NOTE(review): this is the unfused form -- confirm it is acceptable under the
// relaxed-SIMD proposal referenced by this change.
void MacroAssembler::fmsFloat32x4(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // The temporary holds a whole 128-bit vector, so claim the SIMD scratch
  // register rather than the scalar float32 scratch.
  ScratchSimd128Scope scratch(*this);
  moveSimd128(src1, scratch);
  mulFloat32x4(src2, scratch);
  subFloat32x4(scratch, srcDest);
}
2858+
2859+
// Lane-wise f64x2 multiply-accumulate: srcDest += src1 * src2.
//
// Emitted as a separate multiply and add, so the product is formed in a
// temporary before it is accumulated; src1 and src2 are left unmodified.
// NOTE(review): this is the unfused form -- confirm it is acceptable under the
// relaxed-SIMD proposal referenced by this change.
void MacroAssembler::fmaFloat64x2(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // The temporary holds a whole 128-bit vector, so claim the SIMD scratch
  // register rather than the scalar float32 scratch.
  ScratchSimd128Scope scratch(*this);
  moveSimd128(src1, scratch);
  mulFloat64x2(src2, scratch);
  addFloat64x2(scratch, srcDest);
}
2866+
2867+
// Lane-wise f64x2 multiply-subtract: srcDest -= src1 * src2.
//
// Emitted as a separate multiply and subtract, so the product is formed in a
// temporary before it is subtracted; src1 and src2 are left unmodified.
// NOTE(review): this is the unfused form -- confirm it is acceptable under the
// relaxed-SIMD proposal referenced by this change.
void MacroAssembler::fmsFloat64x2(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // The temporary holds a whole 128-bit vector, so claim the SIMD scratch
  // register rather than the scalar float32 scratch.
  ScratchSimd128Scope scratch(*this);
  moveSimd128(src1, scratch);
  mulFloat64x2(src2, scratch);
  subFloat64x2(scratch, srcDest);
}
2874+
28402875
// ========================================================================
28412876
// Truncate floating point.
28422877

js/src/wasm/WasmBaselineCompile.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15240,6 +15240,28 @@ static void BitselectV128(MacroAssembler& masm, RegV128 rhs, RegV128 control,
1524015240
}
1524115241
# endif
1524215242

15243+
# ifdef ENABLE_WASM_RELAXED_SIMD
15244+
// Baseline emitter for the f32x4 relaxed multiply-add opcode: forwards to
// masm.fmaFloat32x4, whose contract is rsd += rs1 * rs2.
static void RelaxedFmaF32x4(MacroAssembler& masm, RegV128 rs1, RegV128 rs2,
15245+
RegV128 rsd) {
15246+
masm.fmaFloat32x4(rs1, rs2, rsd);
15247+
}
15248+
15249+
// Baseline emitter for the f32x4 relaxed multiply-subtract opcode: forwards to
// masm.fmsFloat32x4, whose contract is rsd -= rs1 * rs2.
static void RelaxedFmsF32x4(MacroAssembler& masm, RegV128 rs1, RegV128 rs2,
15250+
RegV128 rsd) {
15251+
masm.fmsFloat32x4(rs1, rs2, rsd);
15252+
}
15253+
15254+
// Baseline emitter for the f64x2 relaxed multiply-add opcode: forwards to
// masm.fmaFloat64x2, whose contract is rsd += rs1 * rs2.
static void RelaxedFmaF64x2(MacroAssembler& masm, RegV128 rs1, RegV128 rs2,
15255+
RegV128 rsd) {
15256+
masm.fmaFloat64x2(rs1, rs2, rsd);
15257+
}
15258+
15259+
// Baseline emitter for the f64x2 relaxed multiply-subtract opcode: forwards to
// masm.fmsFloat64x2, whose contract is rsd -= rs1 * rs2.
static void RelaxedFmsF64x2(MacroAssembler& masm, RegV128 rs1, RegV128 rs2,
15260+
RegV128 rsd) {
15261+
masm.fmsFloat64x2(rs1, rs2, rsd);
15262+
}
15263+
# endif
15264+
1524315265
void BaseCompiler::emitVectorAndNot() {
1524415266
// We want x & ~y but the available operation is ~x & y, so reverse the
1524515267
// operands.
@@ -16950,6 +16972,29 @@ bool BaseCompiler::emitBody() {
1695016972
CHECK_NEXT(emitStoreLane(4));
1695116973
case uint32_t(SimdOp::V128Store64Lane):
1695216974
CHECK_NEXT(emitStoreLane(8));
16975+
# ifdef ENABLE_WASM_RELAXED_SIMD
16976+
case uint32_t(SimdOp::F32x4RelaxedFma):
16977+
if (!moduleEnv_.v128RelaxedEnabled()) {
16978+
return iter_.unrecognizedOpcode(&op);
16979+
}
16980+
CHECK_NEXT(dispatchTernary1(RelaxedFmaF32x4, ValType::V128));
16981+
case uint32_t(SimdOp::F32x4RelaxedFms):
16982+
if (!moduleEnv_.v128RelaxedEnabled()) {
16983+
return iter_.unrecognizedOpcode(&op);
16984+
}
16985+
CHECK_NEXT(dispatchTernary1(RelaxedFmsF32x4, ValType::V128));
16986+
case uint32_t(SimdOp::F64x2RelaxedFma):
16987+
if (!moduleEnv_.v128RelaxedEnabled()) {
16988+
return iter_.unrecognizedOpcode(&op);
16989+
}
16990+
CHECK_NEXT(dispatchTernary1(RelaxedFmaF64x2, ValType::V128));
16991+
case uint32_t(SimdOp::F64x2RelaxedFms):
16992+
if (!moduleEnv_.v128RelaxedEnabled()) {
16993+
return iter_.unrecognizedOpcode(&op);
16994+
}
16995+
CHECK_NEXT(dispatchTernary1(RelaxedFmsF64x2, ValType::V128));
16996+
break;
16997+
# endif
1695316998
default:
1695416999
break;
1695517000
} // switch (op.b1)

js/src/wasm/WasmConstants.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -676,8 +676,8 @@ enum class SimdOp {
676676
I32x4ShrS = 0xac,
677677
I32x4ShrU = 0xad,
678678
I32x4Add = 0xae,
679-
// AddSatS = 0xaf
680-
// AddSatU = 0xb0
679+
F32x4RelaxedFma = 0xaf,
680+
F32x4RelaxedFms = 0xb0,
681681
I32x4Sub = 0xb1,
682682
// SubSatS = 0xb2
683683
// SubSatU = 0xb3
@@ -708,8 +708,8 @@ enum class SimdOp {
708708
I64x2ShrS = 0xcc,
709709
I64x2ShrU = 0xcd,
710710
I64x2Add = 0xce,
711-
// Unused = 0xcf
712-
// Unused = 0xd0
711+
F64x2RelaxedFma = 0xcf,
712+
F64x2RelaxedFms = 0xd0,
713713
I64x2Sub = 0xd1,
714714
// Unused = 0xd2
715715
// Unused = 0xd3

js/src/wasm/WasmIonCompile.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5384,6 +5384,18 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
53845384
CHECK(EmitStoreLaneSimd128(f, 4));
53855385
case uint32_t(SimdOp::V128Store64Lane):
53865386
CHECK(EmitStoreLaneSimd128(f, 8));
5387+
# ifdef ENABLE_WASM_RELAXED_SIMD
5388+
case uint32_t(SimdOp::F32x4RelaxedFma):
5389+
case uint32_t(SimdOp::F32x4RelaxedFms):
5390+
case uint32_t(SimdOp::F64x2RelaxedFma):
5391+
case uint32_t(SimdOp::F64x2RelaxedFms): {
5392+
if (!f.moduleEnv().v128RelaxedEnabled()) {
5393+
return f.iter().unrecognizedOpcode(&op);
5394+
}
5395+
CHECK(EmitTernarySimd128(f, SimdOp(op.b1)));
5396+
}
5397+
# endif
5398+
53875399
default:
53885400
return f.iter().unrecognizedOpcode(&op);
53895401
} // switch (op.b1)

js/src/wasm/WasmOpIter.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,11 @@ OpKind wasm::Classify(OpBytes op) {
591591
case SimdOp::V128Store32Lane:
592592
case SimdOp::V128Store64Lane:
593593
WASM_SIMD_OP(OpKind::StoreLane);
594+
case SimdOp::F32x4RelaxedFma:
595+
case SimdOp::F32x4RelaxedFms:
596+
case SimdOp::F64x2RelaxedFma:
597+
case SimdOp::F64x2RelaxedFms:
598+
WASM_SIMD_OP(OpKind::Ternary);
594599
# ifdef ENABLE_WASM_SIMD_WORMHOLE
595600
case SimdOp::MozWHSELFTEST:
596601
case SimdOp::MozWHPMADDUBSW:

js/src/wasm/WasmValidate.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,6 +1007,19 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
10071007
CHECK(iter.readStoreLane(8, &addr, &noIndex, &nothing));
10081008
}
10091009

1010+
# ifdef ENABLE_WASM_RELAXED_SIMD
1011+
case uint32_t(SimdOp::F32x4RelaxedFma):
1012+
case uint32_t(SimdOp::F32x4RelaxedFms):
1013+
case uint32_t(SimdOp::F64x2RelaxedFma):
1014+
case uint32_t(SimdOp::F64x2RelaxedFms): {
1015+
if (!env.v128RelaxedEnabled()) {
1016+
return iter.unrecognizedOpcode(&op);
1017+
}
1018+
CHECK(
1019+
iter.readTernary(ValType::V128, &nothing, &nothing, &nothing));
1020+
}
1021+
# endif
1022+
10101023
default:
10111024
return iter.unrecognizedOpcode(&op);
10121025
}

0 commit comments

Comments
 (0)