From ae616fafd54c872f6e16f32296eca419a9064fe2 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Sat, 7 Oct 2023 19:33:01 -0400 Subject: [PATCH 1/6] [mono][amd64] Pass SIMD arguments in SIMD registers. --- src/mono/mono/mini/mini-amd64.c | 45 ++++++++++++++++++++++++++++----- src/mono/mono/mini/mini-amd64.h | 2 ++ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 6278ae4b9aea7a..bb48796f9bf967 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -674,6 +674,18 @@ add_valuetype (MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type, return; } + /* Can't use mini_class_is_simd () here as we don't have access to a MonoCompile */ + if (m_class_is_simd_type (klass) && struct_size == 16 && !sig->pinvoke && !is_return) { + if (*fr >= FLOAT_PARAM_REGS) { + pass_on_stack = TRUE; + } else { + ainfo->storage = ArgSIMDInSSEReg; + ainfo->reg = (guint8)*fr; + (*fr) ++; + return; + } + } + if (pass_on_stack) { /* Always pass in memory */ ainfo->offset = GINT32_TO_INT16 (*stack_size); @@ -1877,7 +1889,7 @@ mono_arch_allocate_vars (MonoCompile *cfg) * are volatile across calls. * FIXME: Optimize this. 
*/ - if ((ainfo->storage == ArgInIReg) || (ainfo->storage == ArgInFloatSSEReg) || (ainfo->storage == ArgInDoubleSSEReg) || (ainfo->storage == ArgValuetypeInReg) || (ainfo->storage == ArgGSharedVtInReg)) + if ((ainfo->storage == ArgInIReg) || (ainfo->storage == ArgInFloatSSEReg) || (ainfo->storage == ArgInDoubleSSEReg) || (ainfo->storage == ArgSIMDInSSEReg) || (ainfo->storage == ArgValuetypeInReg) || (ainfo->storage == ArgGSharedVtInReg)) inreg = FALSE; ins->opcode = OP_REGOFFSET; @@ -1886,6 +1898,7 @@ mono_arch_allocate_vars (MonoCompile *cfg) case ArgInIReg: case ArgInFloatSSEReg: case ArgInDoubleSSEReg: + case ArgSIMDInSSEReg: case ArgGSharedVtInReg: if (inreg) { ins->opcode = OP_REGVAR; @@ -1933,14 +1946,23 @@ mono_arch_allocate_vars (MonoCompile *cfg) ins->opcode = OP_REGOFFSET; ins->inst_basereg = cfg->frame_reg; /* These arguments are saved to the stack in the prolog */ - offset = ALIGN_TO (offset, sizeof (target_mgreg_t)); + int argsize, align; + if (ainfo->storage == ArgSIMDInSSEReg) { + argsize = 16; + align = 16; + offset = ALIGN_TO (offset, 16); + } else { + argsize = (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (target_mgreg_t) : sizeof (target_mgreg_t); + align = sizeof (target_mgreg_t); + } + offset = ALIGN_TO (offset, align); if (cfg->arch.omit_fp) { ins->inst_offset = offset; - offset += (ainfo->storage == ArgValuetypeInReg) ? ainfo->nregs * sizeof (target_mgreg_t) : sizeof (target_mgreg_t); + offset += argsize; // Arguments are yet supported by the stack map creation code //cfg->locals_max_stack_offset = MAX (cfg->locals_max_stack_offset, offset); } else { - offset += (ainfo->storage == ArgValuetypeInReg) ? 
ainfo->nregs * sizeof (target_mgreg_t) : sizeof (target_mgreg_t); + offset += argsize; ins->inst_offset = - offset; //cfg->locals_min_stack_offset = MIN (cfg->locals_min_stack_offset, offset); } @@ -2027,7 +2049,15 @@ add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int re MONO_ADD_INS (cfg->cbb, ins); mono_call_inst_add_outarg_reg (cfg, call, ins->dreg, reg, TRUE); + break; + case ArgSIMDInSSEReg: + MONO_INST_NEW (cfg, ins, OP_XMOVE); + ins->dreg = alloc_xreg (cfg); + ins->sreg1 = tree->dreg; + ins->klass = tree->klass; + MONO_ADD_INS (cfg->cbb, ins); + mono_call_inst_add_outarg_reg (cfg, call, ins->dreg, reg, MONO_REG_SIMD); break; default: g_assert_not_reached (); @@ -2171,10 +2201,9 @@ mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig) switch (ainfo->storage) { case ArgInIReg: - linfo->args [i].storage = LLVMArgNormal; - break; case ArgInDoubleSSEReg: case ArgInFloatSSEReg: + case ArgSIMDInSSEReg: linfo->args [i].storage = LLVMArgNormal; break; case ArgOnStack: @@ -2318,6 +2347,7 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call) break; case ArgInFloatSSEReg: case ArgInDoubleSSEReg: + case ArgSIMDInSSEReg: add_outarg_reg (cfg, call, ainfo->storage, ainfo->reg, in); break; case ArgOnStack: @@ -8094,6 +8124,9 @@ MONO_RESTORE_WARNING case ArgInDoubleSSEReg: amd64_movsd_membase_reg (code, ins->inst_basereg, ins->inst_offset, ainfo->reg); break; + case ArgSIMDInSSEReg: + amd64_sse_movups_membase_reg (code, ins->inst_basereg, ins->inst_offset, ainfo->reg); + break; case ArgValuetypeInReg: for (quad = 0; quad < 2; quad ++) { switch (ainfo->pair_storage [quad]) { diff --git a/src/mono/mono/mini/mini-amd64.h b/src/mono/mono/mini/mini-amd64.h index 2c775c1585545b..def055b0b0cd18 100644 --- a/src/mono/mono/mini/mini-amd64.h +++ b/src/mono/mono/mini/mini-amd64.h @@ -292,6 +292,8 @@ typedef enum { ArgValuetypeInReg, ArgValuetypeAddrInIReg, ArgValuetypeAddrOnStack, + /* SIMD value in SSE register */ + 
 ArgSIMDInSSEReg, /* gsharedvt argument passed by addr */ ArgGSharedVtInReg, ArgGSharedVtOnStack, From e97315b31173a0f865b821cd292d1a7c4a252be9 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Wed, 11 Oct 2023 16:01:44 -0400 Subject: [PATCH 2/6] Pass simd arguments using a different xmove_arg opcode. XMOVE can be optimized away which creates problems if the argument is on the stack, since mono_spill_global_vars () doesn't process call arguments, it only processes the move instructions before it. If a normal argument is on the stack, it means it's volatile/indirect, so its move is not optimized away, but for simd args, they can become indirect when the xcall opcode is decomposed which can happen after the xmove was optimized away, so we end up with code like: ``` ldaddr R19 <- R20 vcall2 voidcall [..] [s:xmm0 <- R20] ``` --- src/mono/mono/mini/cpu-amd64.mdesc | 1 + src/mono/mono/mini/mini-amd64.c | 5 ++++- src/mono/mono/mini/mini-llvm.c | 1 + src/mono/mono/mini/mini-ops.h | 2 ++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index a380b4dc7b4a1d..980eb72546abea 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -786,6 +786,7 @@ cvttpd2dq: dest:x src1:x len:5 clob:1 cvttps2dq: dest:x src1:x len:5 clob:1 xmove: dest:x src1:x len:5 +xmove_arg: dest:x src1:x len:5 xzero: dest:x len:5 xones: dest:x len:5 xconst: dest:x len:12 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index bb48796f9bf967..1a763b8ae38c7f 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -2051,7 +2051,7 @@ add_outarg_reg (MonoCompile *cfg, MonoCallInst *call, ArgStorage storage, int re mono_call_inst_add_outarg_reg (cfg, call, ins->dreg, reg, TRUE); break; case ArgSIMDInSSEReg: - MONO_INST_NEW (cfg, ins, OP_XMOVE); + MONO_INST_NEW (cfg, ins, OP_XMOVE_ARG); ins->dreg = alloc_xreg (cfg); ins->sreg1 = tree->dreg; 
ins->klass = tree->klass; @@ -7532,6 +7532,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (ins->dreg != ins->sreg1) amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1); break; + case OP_XMOVE_ARG: + amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1); + break; case OP_XZERO: amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg); break; diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index 6ba6a106d34a2a..9e649a22df80b4 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -6294,6 +6294,7 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb) case OP_MOVE: case OP_LMOVE: case OP_XMOVE: + case OP_XMOVE_ARG: case OP_SETFRET: g_assert (lhs); values [ins->dreg] = lhs; diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index 17dbc05d8763a1..f8a6f4f3104f2c 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1222,6 +1222,8 @@ MINI_OP(OP_XZERO, "xzero", XREG, NONE, NONE) MINI_OP(OP_XONES, "xones", XREG, NONE, NONE) MINI_OP(OP_XCONST, "xconst", XREG, NONE, NONE) MINI_OP(OP_XPHI, "xphi", XREG, NONE, NONE) +/* Same as xmove, but not optimized away by copyprop etc. */ +MINI_OP(OP_XMOVE_ARG, "xmove_arg", XREG, XREG, NONE) /* * These are used for efficient implementation of the From 7e0872d61b5f1d30445366ba3fe346d35e4d72dc Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Wed, 11 Oct 2023 20:28:48 -0400 Subject: [PATCH 3/6] More fixes. 
--- src/mono/mono/mini/method-to-ir.c | 34 +++++++++++++++++++++++++++++++ src/mono/mono/mini/mini-amd64.c | 15 -------------- src/mono/mono/mini/mini-llvm.c | 28 ++++++++++--------------- src/mono/mono/mini/mini.h | 2 ++ 4 files changed, 47 insertions(+), 32 deletions(-) diff --git a/src/mono/mono/mini/method-to-ir.c b/src/mono/mono/mini/method-to-ir.c index fd8a31e1e4360e..df19553c83695d 100644 --- a/src/mono/mono/mini/method-to-ir.c +++ b/src/mono/mono/mini/method-to-ir.c @@ -250,6 +250,12 @@ mono_alloc_preg (MonoCompile *cfg) return alloc_preg (cfg); } +guint32 +mono_alloc_xreg (MonoCompile *cfg) +{ + return alloc_xreg (cfg); +} + guint32 mono_alloc_dreg (MonoCompile *cfg, MonoStackType stack_type) { @@ -1692,6 +1698,34 @@ MONO_RESTORE_WARNING return ins; } +MonoInst* +mini_emit_regmove (MonoCompile *cfg, int sreg, MonoType *type) +{ + MonoInst *ins; + int opcode = mono_type_to_regmove (cfg, type); + + if (opcode == OP_FMOVE) { + MONO_INST_NEW (cfg, ins, OP_FMOVE); + ins->dreg = mono_alloc_freg (cfg); + } else if (opcode == OP_LMOVE) { + MONO_INST_NEW (cfg, ins, OP_LMOVE); + ins->dreg = mono_alloc_lreg (cfg); + } else if (opcode == OP_RMOVE) { + MONO_INST_NEW (cfg, ins, OP_RMOVE); + ins->dreg = mono_alloc_freg (cfg); + } else if (opcode == OP_XMOVE) { + MONO_INST_NEW (cfg, ins, OP_XMOVE); + ins->dreg = mono_alloc_xreg (cfg); + ins->klass = mono_class_from_mono_type_internal (type); + } else { + MONO_INST_NEW (cfg, ins, OP_MOVE); + ins->dreg = mono_alloc_ireg (cfg); + } + ins->sreg1 = sreg; + + return ins; +} + static MonoInst* mono_create_fast_tls_getter (MonoCompile *cfg, MonoTlsKey key) { diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 1a763b8ae38c7f..581f565f437fa2 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -2221,21 +2221,6 @@ mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig) return linfo; } -#if 0 - /* FIXME: the non-LLVM codegen should also pass 
arguments in registers or - * else there could a mismatch when LLVM code calls non-LLVM code - * - * See https://github.com/dotnet/runtime/issues/73454 - */ - if ((t->type == MONO_TYPE_GENERICINST) && !cfg->full_aot && !sig->pinvoke) { - MonoClass *klass = mono_class_from_mono_type_internal (t); - if (mini_class_is_simd (cfg, klass)) { - linfo->args [i].storage = LLVMArgVtypeInSIMDReg; - break; - } - } -#endif - linfo->args [i].storage = LLVMArgVtypeInReg; for (j = 0; j < 2; ++j) linfo->args [i].pair_storage [j] = arg_storage_to_llvm_arg_storage (cfg, ainfo->pair_storage [j]); diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c index 9e649a22df80b4..b61ede102059c8 100644 --- a/src/mono/mono/mini/mini-llvm.c +++ b/src/mono/mono/mini/mini-llvm.c @@ -4060,6 +4060,15 @@ emit_entry_bb (EmitContext *ctx, LLVMBuilderRef builder) } break; default: { + if (m_class_is_simd_type (mono_class_from_mono_type_internal (ainfo->type))) { + /* SIMD value passed by value */ + if (ctx->addresses [reg]) { + LLVMValueRef arg = LLVMGetParam (ctx->lmethod, pindex); + LLVMBuildStore (builder, arg, build_ptr_cast (builder, ctx->addresses [reg]->value, pointer_type(LLVMTypeOf (arg)))); + } + break; + } + LLVMTypeRef t; /* Needed to avoid phi argument mismatch errors since operations on pointers produce i32/i64 */ if (m_type_is_byref (ainfo->type)) @@ -13293,23 +13302,8 @@ mono_llvm_emit_call (MonoCompile *cfg, MonoCallInst *call) switch (ainfo->storage) { case LLVMArgNormal: { MonoType *t = (sig->hasthis && i == 0) ? 
m_class_get_byval_arg (mono_get_intptr_class ()) : ainfo->type; - int opcode; - - opcode = mono_type_to_regmove (cfg, t); - if (opcode == OP_FMOVE) { - MONO_INST_NEW (cfg, ins, OP_FMOVE); - ins->dreg = mono_alloc_freg (cfg); - } else if (opcode == OP_LMOVE) { - MONO_INST_NEW (cfg, ins, OP_LMOVE); - ins->dreg = mono_alloc_lreg (cfg); - } else if (opcode == OP_RMOVE) { - MONO_INST_NEW (cfg, ins, OP_RMOVE); - ins->dreg = mono_alloc_freg (cfg); - } else { - MONO_INST_NEW (cfg, ins, OP_MOVE); - ins->dreg = mono_alloc_ireg (cfg); - } - ins->sreg1 = in->dreg; + + ins = mini_emit_regmove (cfg, in->dreg, t); break; } case LLVMArgVtypeByVal: diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index 4924a0be9d4899..825582a587c1e5 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -2168,6 +2168,7 @@ guint32 mono_alloc_ireg (MonoCompile *cfg); guint32 mono_alloc_lreg (MonoCompile *cfg); guint32 mono_alloc_freg (MonoCompile *cfg); guint32 mono_alloc_preg (MonoCompile *cfg); +guint32 mono_alloc_xreg (MonoCompile *cfg); guint32 mono_alloc_dreg (MonoCompile *cfg, MonoStackType stack_type); guint32 mono_alloc_ireg_ref (MonoCompile *cfg); guint32 mono_alloc_ireg_mp (MonoCompile *cfg); @@ -2350,6 +2351,7 @@ MonoInst* mini_emit_array_store (MonoCompile *cfg, MonoClass *klass, Mon MonoInst* mini_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args, gboolean *ins_type_initialized); MonoInst* mini_emit_inst_for_ctor (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args); MonoInst* mini_emit_inst_for_field_load (MonoCompile *cfg, MonoClassField *field); +MonoInst* mini_emit_regmove (MonoCompile *cfg, int sreg, MonoType *type); MonoInst* mini_handle_enum_has_flag (MonoCompile *cfg, MonoClass *klass, MonoInst *enum_this, int enum_val_reg, MonoInst *enum_flag); MonoInst* mini_handle_unbox (MonoCompile *cfg, MonoClass *klass, MonoInst *val, int context_used); From 
78954829564eb2ccef15bf18e9298e869b38cfae Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Tue, 17 Oct 2023 16:55:30 -0400 Subject: [PATCH 4/6] Return SIMD values in registers as well. --- src/mono/mono/mini/cpu-amd64.mdesc | 3 +++ src/mono/mono/mini/decompose.c | 2 +- src/mono/mono/mini/mini-amd64.c | 36 +++++++++++++++++++++++++++--- src/mono/mono/mini/mini.h | 1 + 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index 980eb72546abea..edc96b80ff6726 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -171,6 +171,9 @@ vcall_reg: src1:i len:64 clob:c vcall_membase: src1:b len:64 clob:c call_reg: dest:a src1:i len:32 clob:c call_membase: dest:a src1:b len:32 clob:c +xcall: dest:x len:64 clob:c +xcall_reg: dest:x src1:i len:64 clob:c +xcall_membase: dest:x src1:b len:64 clob:c iconst: dest:i len:10 i8const: dest:i len:10 r4const: dest:f len:17 diff --git a/src/mono/mono/mini/decompose.c b/src/mono/mono/mini/decompose.c index 84221fc808afb6..5c6e0a33e436c5 100644 --- a/src/mono/mono/mini/decompose.c +++ b/src/mono/mono/mini/decompose.c @@ -1344,7 +1344,7 @@ mono_decompose_vtype_opts (MonoCompile *cfg) MonoCallInst *call = (MonoCallInst*)ins; int size; - if (COMPILE_LLVM (cfg)) + if (COMPILE_LLVM (cfg) || call->dont_decompose) break; if (call->vret_in_reg) { diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 581f565f437fa2..100b4d58be24ff 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -674,8 +674,13 @@ add_valuetype (MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type, return; } - /* Can't use mini_class_is_simd () here as we don't have access to a MonoCompile */ - if (m_class_is_simd_type (klass) && struct_size == 16 && !sig->pinvoke && !is_return) { +#ifndef TARGET_WIN32 + /* + * Can't use mini_class_is_simd () here as we don't have access to a MonoCompile. 
+ * So have to restrict the types to the ones which are supported both in llvm and + * non-llvm mode. + */ + if (m_class_is_simd_type (klass) && struct_size == 16 && !sig->pinvoke) { if (*fr >= FLOAT_PARAM_REGS) { pass_on_stack = TRUE; } else { @@ -685,6 +690,7 @@ add_valuetype (MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type, return; } } +#endif if (pass_on_stack) { /* Always pass in memory */ @@ -1802,6 +1808,7 @@ mono_arch_allocate_vars (MonoCompile *cfg) case ArgInIReg: case ArgInFloatSSEReg: case ArgInDoubleSSEReg: + case ArgSIMDInSSEReg: cfg->ret->opcode = OP_REGVAR; cfg->ret->inst_c0 = cinfo->ret.reg; cfg->ret->dreg = cinfo->ret.reg; @@ -2159,6 +2166,7 @@ mono_arch_get_llvm_call_info (MonoCompile *cfg, MonoMethodSignature *sig) case ArgInIReg: case ArgInFloatSSEReg: case ArgInDoubleSSEReg: + case ArgSIMDInSSEReg: linfo->ret.storage = LLVMArgNormal; break; case ArgValuetypeInReg: { @@ -2435,6 +2443,11 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call) mono_call_inst_add_outarg_reg (cfg, call, vtarg->dreg, cinfo->ret.reg, FALSE); break; } + case ArgSIMDInSSEReg: + call->dont_decompose = TRUE; + g_assert (call->vret_var); + NULLIFY_INS (call->vret_var); + break; default: break; } @@ -4244,6 +4257,12 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code) if (ins->dreg != AMD64_XMM0) amd64_sse_movss_reg_reg (code, ins->dreg, AMD64_XMM0); break; + case OP_XCALL: + case OP_XCALL_REG: + case OP_XCALL_MEMBASE: + if (ins->dreg != AMD64_XMM0) + amd64_sse_movaps_reg_reg (code, ins->dreg, AMD64_XMM0); + break; case OP_VCALL: case OP_VCALL_REG: case OP_VCALL_MEMBASE: @@ -4277,6 +4296,13 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code) } } break; + case OP_VOIDCALL: + case OP_VOIDCALL_REG: + case OP_VOIDCALL_MEMBASE: + break; + default: + g_assert_not_reached (); + break; } return code; @@ -5538,6 +5564,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_VCALL: case OP_VCALL2: case 
OP_VOIDCALL: + case OP_XCALL: call = (MonoCallInst*)ins; code = amd64_handle_varargs_call (cfg, code, call, FALSE); @@ -5553,6 +5580,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_VCALL2_REG: case OP_VOIDCALL_REG: case OP_CALL_REG: + case OP_XCALL_REG: call = (MonoCallInst*)ins; if (AMD64_IS_ARGUMENT_REG (ins->sreg1)) { @@ -5573,6 +5601,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_VCALL2_MEMBASE: case OP_VOIDCALL_MEMBASE: case OP_CALL_MEMBASE: + case OP_XCALL_MEMBASE: call = (MonoCallInst*)ins; amd64_call_membase (code, ins->sreg1, ins->inst_offset); @@ -7518,7 +7547,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1); break; case OP_XMOVE_ARG: - amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1); + if (ins->dreg != ins->sreg1) + amd64_sse_movaps_reg_reg (code, ins->dreg, ins->sreg1); break; case OP_XZERO: amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg); diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index 825582a587c1e5..5afc6355909a77 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -818,6 +818,7 @@ struct MonoCallInst { guint32 rgctx_reg : 1; /* Whenever the call will need an unbox trampoline */ guint need_unbox_trampoline : 1; + guint dont_decompose: 1; }; struct MonoCallArgParm { From e81c022594b2470f581bff42562c0c2ce2716544 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Wed, 18 Oct 2023 20:58:04 -0400 Subject: [PATCH 5/6] Implement dyncall support. 
--- src/mono/mono/mini/cpu-amd64.mdesc | 2 +- src/mono/mono/mini/mini-amd64.c | 50 ++++++++++++++++++++---------- src/mono/mono/mini/mini-amd64.h | 3 +- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index edc96b80ff6726..f01ae166255317 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -600,7 +600,7 @@ vcall2: len:64 clob:c vcall2_reg: src1:i len:64 clob:c vcall2_membase: src1:b len:64 clob:c -dyn_call: src1:i src2:i len:192 clob:c +dyn_call: src1:i src2:i len:252 clob:c localloc_imm: dest:i len:120 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 100b4d58be24ff..2e00c6f8fa710b 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -1128,7 +1128,7 @@ arg_get_storage (CallContext *ccontext, ArgInfo *ainfo) return &ccontext->gregs [ainfo->reg]; case ArgInFloatSSEReg: case ArgInDoubleSSEReg: - return &ccontext->fregs [ainfo->reg]; + return &ccontext->fregs [ainfo->reg * 2]; case ArgOnStack: case ArgValuetypeAddrOnStack: return ccontext->stack + ainfo->offset; @@ -1144,7 +1144,7 @@ arg_get_storage (CallContext *ccontext, ArgInfo *ainfo) return &ccontext->gregs [ainfo->pair_regs [0]]; case ArgInFloatSSEReg: case ArgInDoubleSSEReg: - return &ccontext->fregs [ainfo->pair_regs [0]]; + return &ccontext->fregs [ainfo->pair_regs [0] * 2]; default: g_assert_not_reached (); } @@ -1172,7 +1172,7 @@ arg_get_val (CallContext *ccontext, ArgInfo *ainfo, gpointer dest) break; case ArgInFloatSSEReg: case ArgInDoubleSSEReg: - *(double*)dest_cast = ccontext->fregs [reg_storage]; + *(double*)dest_cast = ccontext->fregs [reg_storage * 2]; break; default: g_assert_not_reached (); @@ -1196,7 +1196,7 @@ arg_set_val (CallContext *ccontext, ArgInfo *ainfo, gpointer src) break; case ArgInFloatSSEReg: case ArgInDoubleSSEReg: - ccontext->fregs [reg_storage] = *(double*)src_cast; + ccontext->fregs 
[reg_storage * 2] = *(double*)src_cast; break; default: g_assert_not_reached (); @@ -2612,6 +2612,7 @@ dyn_call_supported (MonoMethodSignature *sig, CallInfo *cinfo) case ArgInIReg: case ArgInFloatSSEReg: case ArgInDoubleSSEReg: + case ArgSIMDInSSEReg: case ArgValuetypeAddrInIReg: case ArgValuetypeInReg: break; @@ -2625,6 +2626,7 @@ dyn_call_supported (MonoMethodSignature *sig, CallInfo *cinfo) case ArgInIReg: case ArgInFloatSSEReg: case ArgInDoubleSSEReg: + case ArgSIMDInSSEReg: case ArgValuetypeInReg: case ArgValuetypeAddrInIReg: case ArgValuetypeAddrOnStack: @@ -2740,7 +2742,8 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g for (i = 0; i < PARAM_REGS; ++i) general_param_reg_to_index [param_regs[i]] = i; for (i = 0; i < FLOAT_PARAM_REGS; ++i) - float_param_reg_to_index [float_param_regs[i]] = i; + /* 2 entries per SIMD reg */ + float_param_reg_to_index [float_param_regs[i]] = i * 2; mono_memory_barrier (); param_reg_to_index_inited = 1; } else { @@ -2775,7 +2778,7 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g } else if (ainfo->storage == ArgValuetypeAddrInIReg) { g_assert (ainfo->pair_storage [0] == ArgInIReg && ainfo->pair_storage [1] == ArgNone); slot = general_param_reg_to_index [ainfo->pair_regs [0]]; - } else if (ainfo->storage == ArgInFloatSSEReg || ainfo->storage == ArgInDoubleSSEReg) { + } else if (ainfo->storage == ArgInFloatSSEReg || ainfo->storage == ArgInDoubleSSEReg || ainfo->storage == ArgSIMDInSSEReg) { slot = float_param_reg_to_index [ainfo->reg]; } else { slot = general_param_reg_to_index [ainfo->reg]; @@ -2887,6 +2890,10 @@ mono_arch_start_dyn_call (MonoDynCallInfo *info, gpointer **args, guint8 *ret, g for (i = 0; i < ainfo->arg_size / 8; ++i) p->regs [slot + i] = ((target_mgreg_t*)(arg))[i]; break; + case ArgSIMDInSSEReg: + p->has_fp = 1; + memcpy (&(p->fregs [slot]), arg, 16); + break; default: g_assert_not_reached (); break; @@ -2966,21 +2973,24 @@ 
mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf) } else { /* Fall through */ } - case MONO_TYPE_VALUETYPE: - if (dinfo->cinfo->ret.storage == ArgValuetypeAddrInIReg || dinfo->cinfo->ret.storage == ArgGsharedvtVariableInReg) { + case MONO_TYPE_VALUETYPE: { + ArgInfo *ainfo = &dinfo->cinfo->ret; + switch (ainfo->storage) { + case ArgValuetypeAddrInIReg: + case ArgGsharedvtVariableInReg: /* Nothing to do */ - } else { - ArgInfo *ainfo = &dinfo->cinfo->ret; - - g_assert (ainfo->storage == ArgValuetypeInReg); - + break; + case ArgSIMDInSSEReg: + memcpy (ret, &dargs->fregs [0], 16); + break; + case ArgValuetypeInReg: { for (i = 0; i < 2; ++i) { switch (ainfo->pair_storage [0]) { case ArgInIReg: ((host_mgreg_t*)ret)[i] = res; break; case ArgInDoubleSSEReg: - ((double*)ret)[i] = dargs->fregs [i]; + ((double*)ret)[i] = dargs->fregs [i * 2]; break; case ArgNone: break; @@ -2989,8 +2999,14 @@ mono_arch_finish_dyn_call (MonoDynCallInfo *info, guint8 *buf) break; } } + break; + } + default: + g_assert_not_reached (); + break; } break; + } default: g_assert_not_reached (); } @@ -5631,7 +5647,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) label = code; amd64_branch8 (code, X86_CC_Z, -1, 1); for (i = 0; i < FLOAT_PARAM_REGS; ++i) - amd64_sse_movsd_reg_membase (code, i, AMD64_R11, MONO_STRUCT_OFFSET (DynCallArgs, fregs) + (i * sizeof (double))); + amd64_sse_movups_reg_membase (code, i, AMD64_R11, MONO_STRUCT_OFFSET (DynCallArgs, fregs) + (i * 2 * sizeof (double))); amd64_patch (label, code); /* Allocate param area */ @@ -5676,8 +5692,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) /* Save result */ amd64_mov_reg_membase (code, AMD64_R11, var->inst_basereg, var->inst_offset, 8); amd64_mov_membase_reg (code, AMD64_R11, MONO_STRUCT_OFFSET (DynCallArgs, res), AMD64_RAX, 8); - amd64_sse_movsd_membase_reg (code, AMD64_R11, MONO_STRUCT_OFFSET (DynCallArgs, fregs), AMD64_XMM0); - amd64_sse_movsd_membase_reg (code, AMD64_R11, 
MONO_STRUCT_OFFSET (DynCallArgs, fregs) + sizeof (double), AMD64_XMM1); + amd64_sse_movups_membase_reg (code, AMD64_R11, MONO_STRUCT_OFFSET (DynCallArgs, fregs), AMD64_XMM0); + amd64_sse_movups_membase_reg (code, AMD64_R11, MONO_STRUCT_OFFSET (DynCallArgs, fregs) + (sizeof (double) * 2), AMD64_XMM1); break; } case OP_AMD64_SAVE_SP_TO_LMF: { diff --git a/src/mono/mono/mini/mini-amd64.h b/src/mono/mono/mini/mini-amd64.h index def055b0b0cd18..9fcc081431fbd7 100644 --- a/src/mono/mono/mini/mini-amd64.h +++ b/src/mono/mono/mini/mini-amd64.h @@ -277,7 +277,8 @@ struct SeqPointInfo { typedef struct { host_mgreg_t res; guint8 *ret; - double fregs [8]; + /* 2 entries per SIMD reg */ + double fregs [16]; host_mgreg_t has_fp; host_mgreg_t nstack_args; /* This should come last as the structure is dynamically extended */ From d89493a9617fe30c987b0d5903c98ddcd3319613 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Wed, 25 Oct 2023 14:12:02 -0700 Subject: [PATCH 6/6] Implement gsharedvt-in support. --- src/mono/mono/mini/mini-amd64-gsharedvt.c | 34 +++++++++++++--------- src/mono/mono/mini/mini-amd64-gsharedvt.h | 1 + src/mono/mono/mini/tramp-amd64-gsharedvt.c | 7 +++-- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/mono/mono/mini/mini-amd64-gsharedvt.c b/src/mono/mono/mini/mini-amd64-gsharedvt.c index 0aa2f1559006b3..d839cf858276f6 100644 --- a/src/mono/mono/mini/mini-amd64-gsharedvt.c +++ b/src/mono/mono/mini/mini-amd64-gsharedvt.c @@ -163,6 +163,7 @@ get_arg_slots (ArgInfo *ainfo, int **out_slots, gboolean is_source_argument) break; case ArgInDoubleSSEReg: case ArgInFloatSSEReg: + case ArgSIMDInSSEReg: nsrc = 1; src = g_malloc (nsrc * sizeof (int)); src [0] = map_freg (sreg); @@ -224,20 +225,21 @@ static void handle_marshal_when_dst_gsharedvt (ArgInfo *src_info, int *arg_marshal) { switch (src_info->storage) { - case ArgInIReg: - case ArgInDoubleSSEReg: - case ArgInFloatSSEReg: - case ArgValuetypeInReg: - case ArgOnStack: - *arg_marshal = 
GSHAREDVT_ARG_BYVAL_TO_BYREF; - break; - case ArgValuetypeAddrInIReg: - case ArgValuetypeAddrOnStack: - *arg_marshal = GSHAREDVT_ARG_NONE; - break; - default: - NOT_IMPLEMENTED; // See above - break; + case ArgInIReg: + case ArgInDoubleSSEReg: + case ArgInFloatSSEReg: + case ArgSIMDInSSEReg: + case ArgValuetypeInReg: + case ArgOnStack: + *arg_marshal = GSHAREDVT_ARG_BYVAL_TO_BYREF; + break; + case ArgValuetypeAddrInIReg: + case ArgValuetypeAddrOnStack: + *arg_marshal = GSHAREDVT_ARG_NONE; + break; + default: + NOT_IMPLEMENTED; // See above + break; } } @@ -331,6 +333,7 @@ mono_arch_get_gsharedvt_call_info (MonoMemoryManager *mem_manager, gpointer addr case ArgInIReg: case ArgInDoubleSSEReg: case ArgInFloatSSEReg: + case ArgSIMDInSSEReg: case ArgValuetypeInReg: case ArgOnStack: nsrc = get_arg_slots (src_info, &src, TRUE); @@ -500,6 +503,9 @@ mono_arch_get_gsharedvt_call_info (MonoMemoryManager *mem_manager, gpointer addr case ArgInFloatSSEReg: info->ret_marshal = GSHAREDVT_RET_R8; break; + case ArgSIMDInSSEReg: + info->ret_marshal = GSHAREDVT_RET_SIMD; + break; case ArgValuetypeAddrInIReg: break; default: diff --git a/src/mono/mono/mini/mini-amd64-gsharedvt.h b/src/mono/mono/mini/mini-amd64-gsharedvt.h index 539dff9ff51e16..fea5d385b7d968 100644 --- a/src/mono/mono/mini/mini-amd64-gsharedvt.h +++ b/src/mono/mono/mini/mini-amd64-gsharedvt.h @@ -37,6 +37,7 @@ typedef enum { GSHAREDVT_RET_I8, // 8 byte integer GSHAREDVT_RET_IREGS_1, // Load in first return register GSHAREDVT_RET_R8, // Double + GSHAREDVT_RET_SIMD, // SIMD GSHAREDVT_RET_NUM, } GSharedVtRetMarshal; diff --git a/src/mono/mono/mini/tramp-amd64-gsharedvt.c b/src/mono/mono/mini/tramp-amd64-gsharedvt.c index 43b575b9f3d7dd..57860c794829d5 100644 --- a/src/mono/mono/mini/tramp-amd64-gsharedvt.c +++ b/src/mono/mono/mini/tramp-amd64-gsharedvt.c @@ -224,7 +224,7 @@ mono_arch_get_gsharedvt_trampoline (MonoTrampInfo **info, gboolean aot) /*callconv in regs */ caller_reg_area_offset = offset; - reg_area_size = 
ALIGN_TO ((n_arg_regs + n_arg_fregs) * 8, MONO_ARCH_FRAME_ALIGNMENT); + reg_area_size = ALIGN_TO ((n_arg_regs * 8) + (n_arg_fregs * 16), MONO_ARCH_FRAME_ALIGNMENT); offset += reg_area_size; framesize = offset; @@ -266,7 +266,7 @@ mono_arch_get_gsharedvt_trampoline (MonoTrampInfo **info, gboolean aot) amd64_mov_membase_reg (code, AMD64_RSP, caller_reg_area_offset + i * 8, param_regs [i], sizeof (target_mgreg_t)); for (i = 0; i < n_arg_fregs; ++i) - amd64_sse_movsd_membase_reg (code, AMD64_RSP, caller_reg_area_offset + (i + n_arg_regs) * 8, i); + amd64_sse_movups_membase_reg (code, AMD64_RSP, caller_reg_area_offset + (n_arg_regs * 8) + (i * 16), i); /* TODO Allocate stack area used to pass arguments to the method */ @@ -411,6 +411,9 @@ mono_arch_get_gsharedvt_trampoline (MonoTrampInfo **info, gboolean aot) case GSHAREDVT_RET_R8: amd64_sse_movsd_reg_membase (code, AMD64_XMM0, AMD64_R11, 0); break; + case GSHAREDVT_RET_SIMD: + amd64_sse_movups_reg_membase (code, AMD64_XMM0, AMD64_R11, 0); + break; default: x86_breakpoint (code); /* can't handle specific case */ }