diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 9ac73b9853c466..5ebb5ae5c38f48 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -776,6 +776,13 @@ typedef union { amd64_codegen_post(inst); \ } while (0) +#define emit_sse_reg_reg_op4_size_imm(inst,dreg,reg,op1,op2,op3,op4,size,imm) do { \ + amd64_codegen_pre(inst); \ + emit_sse_reg_reg_op4_size ((inst), (dreg), (reg), (op1), (op2), (op3), (op4), (size)); \ + x86_imm_emit8 ((inst), (imm)); \ + amd64_codegen_post(inst); \ +} while (0) + /* specific SSE opcode defines */ #define amd64_sse_xorpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst),(dreg),(reg), 0x66, 0x0f, 0x57) @@ -836,11 +843,16 @@ typedef union { #define amd64_sse_sqrtsd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf2, 0x0f, 0x51) - +#define amd64_sse_pinsrb_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x20, (imm)) +#define amd64_sse_pinsrd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x22, (imm)) +#define amd64_sse_pinsrq_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_size_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x22, 8, (imm)) #define amd64_sse_pinsrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc4, (imm)) +#define amd64_sse_insertps_reg_reg(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x21, (imm)) +#define amd64_sse_pblendw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x0e, (imm)) #define amd64_sse_pextrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc5, (imm)) - +#define amd64_sse_pextrb_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (reg), (dreg), 0x66, 0x0f, 0x3a, 0x14, (imm)) +#define 
amd64_sse_pextrd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (reg), (dreg), 0x66, 0x0f, 0x3a, 0x16, (imm)) #define amd64_sse_cvttsd2si_reg_xreg_size(inst,reg,xreg,size) emit_sse_reg_reg_size ((inst), (reg), (xreg), 0xf2, 0x0f, 0x2c, (size)) @@ -894,6 +906,8 @@ typedef union { #define amd64_sse_shufpd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm((inst), (dreg), (reg), 0x66, 0x0f, 0xC6, (imm)) +#define amd64_sse_roundps_reg_reg_imm(inst, dreg, reg, imm) emit_sse_reg_reg_op4_imm((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x08, (imm)) + #define amd64_sse_roundpd_reg_reg_imm(inst, dreg, reg, imm) emit_sse_reg_reg_op4_imm((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x09, (imm)) #define amd64_sse_addpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x58) @@ -1169,6 +1183,14 @@ typedef union { #define amd64_sse_lzcnt_reg_reg_size(inst, dreg, reg, size) emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xbd, (size)) #define amd64_sse_popcnt_reg_reg_size(inst, dreg, reg, size) emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xb8, (size)) +#define amd64_sse_psrlq_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SHR, (reg), 0x66, 0x0f, 0x73, (imm)) +#define amd64_sse_dpps_reg_reg(inst, dreg, sreg, mask) emit_sse_reg_reg_op4_imm((inst), (dreg), (sreg), 0x66, 0x0f, 0x3a, 0x40, (mask)) +#define amd64_sse_dppd_reg_reg(inst, dreg, sreg, mask) emit_sse_reg_reg_op4_imm((inst), (dreg), (sreg), 0x66, 0x0f, 0x3a, 0x41, (mask)) +#define amd64_sse_phaddw_reg_reg(inst, dreg, sreg) emit_sse_reg_reg_op4((inst), (dreg), (sreg), 0x66, 0x0f, 0x38, 0x01) +#define amd64_sse_phaddd_reg_reg(inst, dreg, sreg) emit_sse_reg_reg_op4((inst), (dreg), (sreg), 0x66, 0x0f, 0x38, 0x02) +#define amd64_sse_blendpd_reg_reg(inst,dreg,sreg,imm) emit_sse_reg_reg_op4_imm((inst), (dreg), (sreg), 0x66, 0x0f, 0x3a, 0x0d, (imm)) +#define amd64_movq_reg_reg(inst,dreg,sreg) emit_sse_reg_reg ((inst), (dreg), (sreg), 0xf3, 0x0f, 0x7e) + /* 
Generated from x86-codegen.h */ #define amd64_breakpoint_size(inst,size) do { x86_breakpoint(inst); } while (0) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index f6eb2d52494adb..0842ac8b470b63 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -600,6 +600,15 @@ loadu2_mem: dest:i len:16 #SIMD +xbinop: dest:x src1:x src2:x len:7 clob:1 +xbinop_forceint: dest:x src1:x src2:x len:7 clob:1 +ones_complement: dest:x src1:x len:16 clob:1 +negate: dest:x src1:x len:24 clob:1 +xlower: dest:x src1:x len:16 +xupper: dest:x src1:x len:16 +convert_fp_to_si: dest:x src1:x len:16 +convert_si_to_fp: dest:x src1:x len:16 + addps: dest:x src1:x src2:x len:4 clob:1 divps: dest:x src1:x src2:x len:4 clob:1 mulps: dest:x src1:x src2:x len:4 clob:1 @@ -772,17 +781,23 @@ xones: dest:x len:5 xconst: dest:x len:12 iconv_to_x: dest:x src1:i len:5 -extract_i4: dest:i src1:x len:5 +extract_i4: dest:i src1:x len:16 extract_i8: dest:i src1:x len:9 - extract_i2: dest:i src1:x len:13 extract_i1: dest:i src1:x len:13 extract_r8: dest:f src1:x len:5 +extract_r4: dest:f src1:x len:24 +xextract: dest:i src1:x len:24 iconv_to_r4_raw: dest:f src1:i len:10 +insert_i1: dest:x src1:x src2:i len:7 clob:1 insert_i2: dest:x src1:x src2:i len:6 clob:1 +insert_i4: dest:x src1:x src2:i len:7 clob:1 +insert_i8: dest:x src1:x src2:i len:7 clob:1 +insert_r4: dest:x src1:x src2:f len:7 clob:1 +insert_r8: dest:x src1:x src2:f len:24 clob:1 extractx_u2: dest:i src1:x len:6 insertx_u1_slow: dest:x src1:i src2:i len:18 clob:x @@ -810,6 +825,11 @@ expand_i4: dest:x src1:i len:11 expand_i8: dest:x src1:i len:11 expand_r4: dest:x src1:f len:16 expand_r8: dest:x src1:f len:13 +xop_x_x_x: dest:x src1:x src2:x len:16 clob:1 +xop_x_x: dest:x src1:x len:16 clob:1 +sse41_dpps_imm: dest:x src1:x src2:x len:7 clob:1 +sse41_dppd_imm: dest:x src1:x src2:x len:7 clob:1 +vector_andnot: dest:x src1:x src2:x len:7 clob:1 roundp: dest:x src1:x len:10 diff 
--git a/src/mono/mono/mini/ir-emit.h b/src/mono/mono/mini/ir-emit.h index 6c0b84213eb491..6b88e959e3852f 100644 --- a/src/mono/mono/mini/ir-emit.h +++ b/src/mono/mono/mini/ir-emit.h @@ -886,7 +886,7 @@ static int ccount = 0; cfg->flags |= MONO_CFG_HAS_CHECK_THIS; \ MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, (reg), 0); \ MONO_EMIT_NEW_COND_EXC (cfg, EQ, "NullReferenceException"); \ - MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, reg); \ + if (COMPILE_LLVM (cfg)) MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, reg); \ } while (0) /* Emit an explicit null check which doesn't depend on SIGSEGV signal handling */ @@ -897,7 +897,7 @@ static int ccount = 0; } else { \ MONO_EMIT_NEW_IMPLICIT_EXCEPTION_LOAD_STORE (cfg); \ } \ - MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, reg); \ + if (COMPILE_LLVM (cfg)) MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, reg); \ } while (0) #define MONO_EMIT_NEW_CHECK_THIS(cfg, sreg) do { \ @@ -907,7 +907,7 @@ static int ccount = 0; } else { \ MONO_EMIT_NEW_UNALU (cfg, OP_CHECK_THIS, -1, sreg); \ MONO_EMIT_NEW_IMPLICIT_EXCEPTION_LOAD_STORE (cfg); \ - MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, sreg); \ + if (COMPILE_LLVM (cfg)) MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, sreg); \ } \ } while (0) diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 12e72cc29cd4d0..9b7c90e45a13f5 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -48,6 +48,7 @@ #include "mini-gc.h" #include "mini-runtime.h" #include "aot-runtime.h" +#include "llvm-intrinsics-types.h" MONO_DISABLE_WARNING(4127) /* conditional expression is constant */ @@ -67,6 +68,7 @@ static gpointer bp_trampoline; /* Offset between fp and the first argument in the callee */ #define ARGS_OFFSET 16 #define GP_SCRATCH_REG AMD64_R11 +#define SIMD_TEMP_REG MONO_ARCH_FP_SCRATCH_REG /* Max number of bblocks before we bail from using more advanced branch placement code */ #define MAX_BBLOCKS_FOR_BRANCH_OPTS 800 @@ -4015,7 +4017,7 @@ 
mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb) ins->opcode = ins->inst_c1 == MONO_TYPE_R8 ? OP_MAXPD : OP_MAXPS; break; default: - g_assert_not_reached(); + // Handled in mono_arch_output_basic_block () break; } break; @@ -6033,8 +6035,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (cfg->compile_aot && cfg->code_exec_only) { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8_GOT, &r8_0); amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof (target_mgreg_t)); - amd64_sse_movsd_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_R11, 0); - amd64_sse_xorpd_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_sse_movsd_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); + amd64_sse_xorpd_reg_reg (code, ins->dreg, SIMD_TEMP_REG); } else { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, &r8_0); amd64_sse_xorpd_reg_membase (code, ins->dreg, AMD64_RIP, 0); @@ -6049,8 +6051,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (cfg->compile_aot && cfg->code_exec_only) { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8_GOT, &d); amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof (target_mgreg_t)); - amd64_sse_movsd_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_R11, 0); - amd64_sse_andpd_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_sse_movsd_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); + amd64_sse_andpd_reg_reg (code, ins->dreg, SIMD_TEMP_REG); } else { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, &d); amd64_sse_andpd_reg_membase (code, ins->dreg, AMD64_RIP, 0); @@ -6081,13 +6083,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (cfg->compile_aot && cfg->code_exec_only) { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4_GOT, &r4_0); amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof (target_mgreg_t)); - amd64_sse_movss_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_R11, 0); + 
amd64_sse_movss_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); } else { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4, &r4_0); - amd64_sse_movss_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_RIP, 0); + amd64_sse_movss_reg_membase (code, SIMD_TEMP_REG, AMD64_RIP, 0); } - amd64_sse_xorps_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_sse_xorps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); break; } @@ -6674,6 +6676,134 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } #ifdef MONO_ARCH_SIMD_INTRINSICS /* TODO: Some of these IR opcodes are marked as no clobber when they indeed do. */ + case OP_XBINOP: { + switch (ins->inst_c0) { + case OP_IMUL: + switch (ins->inst_c1) { + case MONO_TYPE_I8: + case MONO_TYPE_U8: + amd64_sse_pmuludq_reg_reg (code, ins->sreg1, ins->sreg2); + break; + default: + g_assert_not_reached (); + break; + } + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_XBINOP_FORCEINT: { + switch (ins->inst_c0) { + case XBINOP_FORCEINT_AND: + amd64_sse_andpd_reg_reg (code, ins->sreg1, ins->sreg2); + break; + case XBINOP_FORCEINT_OR: + amd64_sse_orpd_reg_reg (code, ins->sreg1, ins->sreg2); + break; + case XBINOP_FORCEINT_XOR: + amd64_sse_xorpd_reg_reg (code, ins->sreg1, ins->sreg2); + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_XOP_X_X_X: { + switch (ins->inst_c0) { + case INTRINS_SSE_PHADDW: + amd64_sse_phaddw_reg_reg (code, ins->dreg, ins->sreg2); + break; + case INTRINS_SSE_PHADDD: + amd64_sse_phaddd_reg_reg (code, ins->dreg, ins->sreg2); + break; + case INTRINS_SSE_HADDPS: + amd64_sse_haddps_reg_reg (code, ins->dreg, ins->sreg2); + break; + case INTRINS_SSE_HADDPD: + amd64_sse_haddpd_reg_reg (code, ins->dreg, ins->sreg2); + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_XOP_X_X: { + switch (ins->inst_c0) { + case INTRINS_SIMD_SQRT_R4: + amd64_sse_sqrtps_reg_reg (code, ins->dreg, ins->sreg1); + break; + case 
INTRINS_SIMD_SQRT_R8: + amd64_sse_sqrtpd_reg_reg (code, ins->dreg, ins->sreg1); + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_SSE41_DPPS_IMM: + amd64_sse_dpps_reg_reg (code, ins->dreg, ins->sreg2, ins->inst_c0); + break; + case OP_SSE41_DPPD_IMM: + amd64_sse_dppd_reg_reg (code, ins->dreg, ins->sreg2, ins->inst_c0); + break; + case OP_ONES_COMPLEMENT: + amd64_sse_pcmpeqd_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_pxor_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case OP_NEGATION: { + switch (ins->inst_c1) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psubb_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case MONO_TYPE_I2: + case MONO_TYPE_U2: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psubw_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psubd_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psubq_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case MONO_TYPE_R4: { + /* -0.0 */ + amd64_sse_pcmpeqw_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_pslld_reg_imm (code, SIMD_TEMP_REG, 31); + g_assert (ins->sreg1 == ins->dreg); + amd64_sse_xorps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + } + case MONO_TYPE_R8: { + /* -0.0 */ + amd64_sse_pcmpeqw_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psllq_reg_imm (code, SIMD_TEMP_REG, 63); + g_assert (ins->sreg1 == ins->dreg); + 
amd64_sse_xorps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + } + default: + g_assert_not_reached (); + break; + } + break; + } case OP_ADDPS: amd64_sse_addps_reg_reg (code, ins->sreg1, ins->sreg2); break; @@ -6820,6 +6950,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_PXOR: amd64_sse_pxor_reg_reg (code, ins->sreg1, ins->sreg2); break; + case OP_VECTOR_ANDN: + g_assert (ins->dreg == ins->sreg1); + amd64_sse_pandn_reg_reg (code, ins->dreg, ins->sreg2); + break; case OP_PADDB: amd64_sse_paddb_reg_reg (code, ins->sreg1, ins->sreg2); @@ -7110,20 +7244,22 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4); break; case OP_EXTRACT_I4: - amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); + if (ins->inst_c0) { + amd64_sse_pextrd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); + } else { + amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); + } break; case OP_EXTRACT_I8: if (ins->inst_c0) { - amd64_movhlps_reg_reg (code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg1); - amd64_movd_reg_xreg_size (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG, 8); + amd64_movhlps_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_movd_reg_xreg_size (code, ins->dreg, SIMD_TEMP_REG, 8); } else { amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 8); } break; case OP_EXTRACT_I1: - amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); - if (ins->inst_c0) - amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8); + amd64_sse_pextrb_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); amd64_widen_reg (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I1, FALSE); break; case OP_EXTRACT_I2: @@ -7139,9 +7275,44 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) else amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1); break; + case OP_EXTRACT_R4: { + if (ins->inst_c0 == 0) { + amd64_sse_movss_reg_reg (code, ins->dreg, 
ins->sreg1); + } else { + int imm = ins->inst_c0; + amd64_sse_movaps_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_shufps_reg_reg_imm (code, SIMD_TEMP_REG, ins->sreg1, imm); + amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg); + amd64_sse_movss_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + } + break; + } + case OP_INSERT_I1: + amd64_sse_pinsrb_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); + break; case OP_INSERT_I2: amd64_sse_pinsrw_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); break; + case OP_INSERT_I4: + amd64_sse_pinsrd_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); + break; + case OP_INSERT_I8: + amd64_sse_pinsrq_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); + break; + case OP_INSERT_R4: { + guint8 imm = (0 << 6) | (ins->inst_c0 << 4); + amd64_sse_insertps_reg_reg (code, ins->sreg1, ins->sreg2, imm); + break; + } + case OP_INSERT_R8: { + if (ins->inst_c0 == 0) { + amd64_sse_blendpd_reg_reg (code, ins->dreg, ins->sreg2, 0b1); + } else { + g_assert (ins->inst_c0 == 1); + amd64_movlhps_reg_reg (code, ins->dreg, ins->sreg2); + } + break; + } case OP_EXTRACTX_U2: amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); break; @@ -7165,11 +7336,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1); break; case OP_INSERTX_I8_SLOW: - amd64_movd_xreg_reg_size(code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg2, 8); + amd64_movd_xreg_reg_size(code, SIMD_TEMP_REG, ins->sreg2, 8); if (ins->inst_c0) - amd64_movlhps_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_movlhps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); else - amd64_sse_movsd_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_sse_movsd_reg_reg (code, ins->dreg, SIMD_TEMP_REG); break; case OP_INSERTX_R4_SLOW: @@ -7200,6 +7371,41 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) else amd64_sse_movsd_reg_reg 
(code, ins->dreg, ins->sreg2); break; + case OP_XEXTRACT: { + /* Elements are either 0 or 0xff */ + g_assert (ins->inst_c1 == 16); + amd64_sse_pmovmskb_reg_reg (code, ins->dreg, ins->sreg1); + if (ins->inst_c0 == SIMD_EXTR_ARE_ALL_SET) { + /* dreg = (mask == 0xffff) */ + amd64_alu_reg_imm_size (code, X86_CMP, ins->dreg, 0xffff, 4); + amd64_set_reg (code, X86_CC_EQ, ins->dreg, FALSE); + amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); + } else if (ins->inst_c0 == SIMD_EXTR_IS_ANY_SET) { + /* dreg = (mask != 0) */ + amd64_alu_reg_imm_size (code, X86_CMP, ins->dreg, 0, 4); + amd64_set_reg (code, X86_CC_NE, ins->dreg, FALSE); + amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); + } else { + g_assert_not_reached (); + } + break; + } + case OP_XLOWER: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_pblendw_reg_reg_imm (code, SIMD_TEMP_REG, ins->sreg1, 0b1111); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case OP_XUPPER: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_movhlps_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case OP_CVT_FP_SI: + amd64_sse_cvttps2dq_reg_reg (code, ins->dreg, ins->sreg1); + break; + case OP_CVT_SI_FP: + amd64_sse_cvtdq2ps_reg_reg (code, ins->dreg, ins->sreg1); + break; case OP_STOREX_MEMBASE_REG: case OP_STOREX_MEMBASE: amd64_sse_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1); @@ -7293,7 +7499,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (ins->inst_c1 == MONO_TYPE_R8) amd64_sse_roundpd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); else - g_assert_not_reached (); // roundps, but it's not used anywhere for non-llvm back-end yet. 
+ amd64_sse_roundps_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); break; } #endif diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index b5bc5579b137f5..8db4ec672ab506 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -817,7 +817,10 @@ MINI_OP(OP_EXTRACT_R4, "extract_r4", FREG, XREG, NONE) MINI_OP(OP_EXTRACT_R8, "extract_r8", FREG, XREG, NONE) MINI_OP(OP_EXTRACTX_U2, "extractx_u2", IREG, XREG, NONE) -/* Used by LLVM */ +/* + * Insert an element into a vector with a constant lane index. + * inst_c0 is the lane index. + */ MINI_OP(OP_INSERT_I1, "insert_i1", XREG, XREG, IREG) MINI_OP(OP_INSERT_I2, "insert_i2", XREG, XREG, IREG) MINI_OP(OP_INSERT_I4, "insert_i4", XREG, XREG, IREG) @@ -1145,6 +1148,9 @@ MINI_OP(OP_SSE_CVTII, "sse_cvtii", XREG, XREG, NONE) MINI_OP3(OP_SSE41_DPPS, "sse41_dpps", XREG, XREG, XREG, IREG) MINI_OP3(OP_SSE41_DPPD, "sse41_dppd", XREG, XREG, XREG, IREG) MINI_OP3(OP_SSE41_MPSADBW, "sse41_mpsadbw", XREG, XREG, XREG, IREG) +/* inst_c0 contains the mask value */ +MINI_OP(OP_SSE41_DPPS_IMM, "sse41_dpps_imm", XREG, XREG, XREG) +MINI_OP(OP_SSE41_DPPD_IMM, "sse41_dppd_imm", XREG, XREG, XREG) /* pclmulqdq */ MINI_OP3(OP_PCLMULQDQ, "pclmulqdq", XREG, XREG, XREG, IREG) @@ -1505,15 +1511,23 @@ MINI_OP(OP_XCOMPARE_SCALAR, "xcompare_scalar", XREG, XREG, XREG) MINI_OP(OP_XCOMPARE_FP, "xcompare_fp", XREG, XREG, XREG) MINI_OP(OP_XCOMPARE_FP_SCALAR, "xcompare_fp_scalar", XREG, XREG, XREG) -/* Extract from XREG into IREG. - * inst_c0 - specific instruction, one of SIMD_EXTR_... */ +/* + * The input reg is the result of OP_XCOMPARE, i.e. + * every element is either 0 or 0xff. + * Compute an integer result based on whether all or any + * bits are non-zero. + * inst_c0 - specific instruction, one of SIMD_EXTR_... + * inst_c1 - vector size in bytes + */ MINI_OP(OP_XEXTRACT, "xextract", IREG, XREG, NONE) /* * Generic SIMD operations, the rest of the JIT doesn't care about the exact operation. 
*/ MINI_OP(OP_XUNOP, "xunop", XREG, XREG, NONE) +/* inst_c0 is a OP_ constant, inst_c1 is a MONO_TYPE_ constant */ MINI_OP(OP_XBINOP, "xbinop", XREG, XREG, XREG) +/* The arguments are treated as vectors of integer types. inst_c0 is a XBINOP_FORCEINT_ constant */ MINI_OP(OP_XBINOP_FORCEINT, "xbinop_forceint", XREG, XREG, XREG) MINI_OP(OP_XBINOP_SCALAR, "xbinop_scalar", XREG, XREG, XREG) MINI_OP(OP_XBINOP_BYSCALAR, "xbinop_byscalar", XREG, XREG, XREG) @@ -1545,7 +1559,9 @@ MINI_OP(OP_XOP_OVR_BYSCALAR_X_X_X, "xop_ovr_byscalar_x_x_x", XREG, XREG, XREG) MINI_OP(OP_XCONCAT, "xconcat", XREG, XREG, XREG) MINI_OP(OP_XCAST, "xcast", XREG, XREG, NONE) +/* Return a new vector containing the lower half of the source */ MINI_OP(OP_XLOWER, "xlower", XREG, XREG, NONE) +/* Return a new vector containing the upper half of the source */ MINI_OP(OP_XUPPER, "xupper", XREG, XREG, NONE) MINI_OP(OP_XWIDEN, "xwiden", XREG, XREG, NONE) MINI_OP(OP_XWIDEN_UNSAFE, "xwiden_unsafe", XREG, XREG, NONE) @@ -1827,8 +1843,10 @@ MINI_OP(OP_CVT_UI_FP, "convert_ui_to_fp", XREG, XREG, NONE) MINI_OP(OP_CVT_SI_FP, "convert_si_to_fp", XREG, XREG, NONE) MINI_OP(OP_CVT_UI_FP_SCALAR, "convert_ui_to_fp_scalar", XREG, XREG, NONE) MINI_OP(OP_CVT_SI_FP_SCALAR, "convert_si_to_fp_scalar", XREG, XREG, NONE) +/* inst_c1 is one of the MONO_TYPE_ constants */ MINI_OP(OP_NEGATION, "negate", XREG, XREG, NONE) MINI_OP(OP_NEGATION_SCALAR, "negate_scalar", XREG, XREG, NONE) +/* Select bits from src2/src3 using src1 */ MINI_OP3(OP_BSL, "bitwise_select", XREG, XREG, XREG, XREG) #endif // TARGET_ARM64 || TARGET_AMD64 || TARGET_WASM diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index eb208bb3a6a7bd..eb0f6e213b0434 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -380,15 +380,27 @@ emit_simd_ins_for_binary_op (MonoCompile *cfg, MonoClass *klass, MonoMethodSigna return NULL; case SN_Max: instc0 = type_enum_is_unsigned (arg_type) ? 
OP_IMAX_UN : OP_IMAX; +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg) && instc0 == OP_IMAX_UN) + return NULL; +#endif break; case SN_Min: instc0 = type_enum_is_unsigned (arg_type) ? OP_IMIN_UN : OP_IMIN; +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg) && instc0 == OP_IMIN_UN) + return NULL; +#endif break; case SN_Multiply: case SN_op_Multiply: { #ifdef TARGET_ARM64 if (!COMPILE_LLVM (cfg) && (arg_type == MONO_TYPE_I8 || arg_type == MONO_TYPE_U8 || arg_type == MONO_TYPE_I || arg_type == MONO_TYPE_U)) return NULL; +#endif +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg)) + return NULL; #endif if (fsig->params [1]->type != MONO_TYPE_GENERICINST) return handle_mul_div_by_scalar (cfg, klass, arg_type, args [1]->dreg, args [0]->dreg, OP_IMUL); @@ -1009,6 +1021,8 @@ emit_hardware_intrinsics ( if (id == SN_get_IsSupported) { MonoInst *ins = NULL; EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0); + if (cfg->verbose_level > 1) + g_printf ("\t-> %s\n", supported ? "true" : " false"); return ins; } if (!supported) { @@ -1332,15 +1346,14 @@ emit_msb_shift_vector_constant (MonoCompile *cfg, MonoClass *arg_class, MonoType return msb_shift_vec; } -/* Emit intrinsics in System.Numerics.Vector and System.Runtime.Intrinsics.Vector64/128/256/512 */ +/* + * Emit intrinsics in System.Numerics.Vector and System.Runtime.Intrinsics.Vector64/128/256/512. + * If the intrinsic is not supported for some reasons, return NULL, and fall back to the c# + * implementation. 
+ */ static MonoInst* emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args) { -#if defined(TARGET_AMD64) || defined(TARGET_WASM) - if (!COMPILE_LLVM (cfg)) - return NULL; -#endif - int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod); if (id == -1) { //check_no_intrinsic_cattr (cmethod); @@ -1379,6 +1392,23 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } #endif +#ifdef TARGET_WASM + g_assert (COMPILE_LLVM (cfg)); +#endif + +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg)) { + if (vector_size != 128) + return NULL; +#ifdef TARGET_WIN32 + return NULL; +#endif + if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41)) + /* Some opcodes like pextrd require sse41 */ + return NULL; + } +#endif + MonoClass* klass = fsig->param_count > 0 ? args[0]->klass : cmethod->klass; MonoTypeEnum arg0_type = fsig->param_count > 0 ? get_underlying_type (fsig->params [0]) : MONO_TYPE_VOID; @@ -1408,10 +1438,14 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // args [0] & ~vector(-0.0) MonoInst *zero = emit_xzero(cfg, arg_class); // 0.0 zero = emit_simd_ins (cfg, klass, OP_NEGATION, zero->dreg, -1); // -0.0 + zero->inst_c1 = arg0_type; MonoInst *ins = emit_simd_ins (cfg, klass, OP_VECTOR_ANDN, zero->dreg, args [0]->dreg); ins->inst_c1 = arg0_type; return ins; } else { + if (!COMPILE_LLVM (cfg)) + // FIXME: + return NULL; return emit_simd_ins_for_sig (cfg, klass, OP_VECTOR_IABS, -1, arg0_type, fsig, args); } #elif defined(TARGET_WASM) @@ -1486,9 +1520,22 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #endif } case SN_ConditionalSelect: { -#if defined(TARGET_ARM64) || defined(TARGET_AMD64) || defined(TARGET_WASM) if (!is_element_type_primitive (fsig->params [0])) return NULL; + +#if defined(TARGET_ARM64) || defined(TARGET_AMD64) || defined(TARGET_WASM) + +#if defined(TARGET_AMD64) + if (!COMPILE_LLVM (cfg)) 
{ + MonoInst *val1 = emit_simd_ins (cfg, klass, OP_XBINOP_FORCEINT, args [0]->dreg, args [1]->dreg); + val1->inst_c0 = XBINOP_FORCEINT_AND; + MonoInst *val2 = emit_simd_ins (cfg, klass, OP_VECTOR_ANDN, args [0]->dreg, args [2]->dreg); + MonoInst *ins = emit_simd_ins (cfg, klass, OP_XBINOP_FORCEINT, val1->dreg, val2->dreg); + ins->inst_c0 = XBINOP_FORCEINT_OR; + return ins; + } +#endif + return emit_simd_ins_for_sig (cfg, klass, OP_BSL, -1, arg0_type, fsig, args); #else return NULL; @@ -1512,6 +1559,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi op = arg0_type == MONO_TYPE_I8 ? OP_CVT_SI_FP_SCALAR : OP_CVT_UI_FP_SCALAR; else op = arg0_type == MONO_TYPE_I8 ? OP_CVT_SI_FP : OP_CVT_UI_FP; + +#ifdef TARGET_AMD64 + // Fall back to the c# code + if (!COMPILE_LLVM (cfg)) + return NULL; +#endif + return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args); #else return NULL; @@ -1530,6 +1584,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } #endif #if defined(TARGET_ARM64) || defined(TARGET_AMD64) + +#if defined(TARGET_AMD64) + if (!COMPILE_LLVM (cfg) && id == SN_ConvertToUInt32) + // FIXME: + return NULL; +#endif + int op = id == SN_ConvertToInt32 ? OP_CVT_FP_SI : OP_CVT_FP_UI; return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args); #else @@ -1556,6 +1617,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi op = size == 8 ? OP_CVT_FP_SI_SCALAR : OP_CVT_FP_SI; else op = size == 8 ? OP_CVT_FP_UI_SCALAR : OP_CVT_FP_UI; + +#if defined(TARGET_AMD64) + if (!COMPILE_LLVM (cfg)) + // FIXME: + return NULL; +#endif + return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args); #else return NULL; @@ -1573,6 +1641,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #endif #if defined(TARGET_ARM64) || defined(TARGET_AMD64) int op = arg0_type == MONO_TYPE_I4 ? 
OP_CVT_SI_FP : OP_CVT_UI_FP; + +#if defined(TARGET_AMD64) + if (!COMPILE_LLVM (cfg) && op == OP_CVT_UI_FP) + // FIXME: + return NULL; +#endif + return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args); #else return NULL; @@ -1591,6 +1666,11 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // Require Vector64 SIMD support if (!COMPILE_LLVM (cfg)) return NULL; +#endif +#if defined(TARGET_AMD64) + // Require Vector64 SIMD support + if (!COMPILE_LLVM (cfg)) + return NULL; #endif return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg); } @@ -1598,33 +1678,30 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_vector_create_elementwise (cfg, fsig, fsig->ret, arg0_type, args); break; } - case SN_CreateScalar: { - MonoType *etype = get_vector_t_elem_type (fsig->ret); - if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) - return NULL; - if (COMPILE_LLVM (cfg)) - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR, -1, arg0_type, fsig, args); - else { - if (type_enum_is_float (arg0_type)) { - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args); - } else { - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args); - } - } - - } + case SN_CreateScalar: case SN_CreateScalarUnsafe: { MonoType *etype = get_vector_t_elem_type (fsig->ret); if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; - if (COMPILE_LLVM (cfg)) - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE, -1, arg0_type, fsig, args); - else { + gboolean is_unsafe = id == SN_CreateScalarUnsafe; + if (COMPILE_LLVM (cfg)) { + return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? 
OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR, -1, arg0_type, fsig, args); + } else { +#ifdef TARGET_AMD64 + MonoInst *ins; + + ins = emit_xzero (cfg, klass); + ins = emit_simd_ins (cfg, klass, type_to_insert_op (arg0_type), ins->dreg, args [0]->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = arg0_type; + return ins; +#else if (type_enum_is_float (arg0_type)) { - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE_FLOAT, -1, arg0_type, fsig, args); + return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE_FLOAT : OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args); } else { - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE_INT, -1, arg0_type, fsig, args); + return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE_INT : OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args); } +#endif } } case SN_Dot: { @@ -1646,21 +1723,31 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi int instc =-1; if (type_enum_is_float (arg0_type)) { if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41)) { - int mask_reg = alloc_ireg (cfg); + int mask_val = -1; switch (arg0_type) { - case MONO_TYPE_R4: - instc = OP_SSE41_DPPS; - MONO_EMIT_NEW_ICONST (cfg, mask_reg, 0xf1); // 0xf1 ... 0b11110001 + case MONO_TYPE_R4: + instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPS : OP_SSE41_DPPS_IMM; + mask_val = 0xf1; // 0xf1 ... 0b11110001 + break; + case MONO_TYPE_R8: + instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPD : OP_SSE41_DPPD_IMM; + mask_val = 0x31; // 0x31 ... 0b00110001 break; - case MONO_TYPE_R8: - instc = OP_SSE41_DPPD; - MONO_EMIT_NEW_ICONST (cfg, mask_reg, 0x31); // 0x31 ... 
0b00110001 - break; default: return NULL; } - MonoInst *dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); - dot->sreg3 = mask_reg; + + MonoInst *dot; + if (COMPILE_LLVM (cfg)) { + int mask_reg = alloc_ireg (cfg); + MONO_EMIT_NEW_ICONST (cfg, mask_reg, mask_val); + + dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); + dot->sreg3 = mask_reg; + } else { + dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); + dot->inst_c0 = mask_val; + } return extract_first_element (cfg, klass, arg0_type, dot->dreg); } else { @@ -1670,6 +1757,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1) return NULL; // We don't support sum vector for byte, sbyte types yet + // FIXME: + if (!COMPILE_LLVM (cfg)) + return NULL; + instc = OP_IMUL; } MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc, arg0_type, fsig, args); @@ -1800,7 +1891,14 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems); MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); - if (COMPILE_LLVM(cfg) || type_to_width_log2 (arg0_type) == 3) { + gboolean use_xextract; +#ifdef TARGET_AMD64 + use_xextract = FALSE; +#else + use_xextract = type_to_width_log2 (arg0_type) == 3; +#endif + + if (COMPILE_LLVM (cfg) || use_xextract) { // Use optimized paths for 64-bit extractions or whatever LLVM yields if enabled. int extract_op = type_to_xextract_op (arg0_type); return emit_simd_ins_for_sig (cfg, klass, extract_op, -1, arg0_type, fsig, args); @@ -1828,6 +1926,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!is_element_type_primitive (fsig->params [0])) return NULL; int op = id == SN_GetLower ? 
OP_XLOWER : OP_XUPPER; + +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg)) + /* These return a Vector64 */ + return NULL; +#endif + return emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args); } case SN_GreaterThan: @@ -2031,6 +2136,9 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (vector_size == 128 && (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, args); return NULL; +#elif defined(TARGET_AMD64) + // FIXME: + return NULL; #else return NULL; #endif @@ -2109,14 +2217,20 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return ins; } - if (!COMPILE_LLVM(cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST) { + if (!COMPILE_LLVM (cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST) return NULL; - } MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems); MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); - if (COMPILE_LLVM(cfg) || type_to_width_log2 (arg0_type) == 3) { + gboolean use_xextract; +#ifdef TARGET_AMD64 + use_xextract = FALSE; +#else + use_xextract = type_to_width_log2 (arg0_type) == 3; +#endif + + if (COMPILE_LLVM (cfg) || use_xextract) { int insert_op = type_to_xinsert_op (arg0_type); MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg); ins->sreg3 = args [1]->dreg; @@ -2171,16 +2285,19 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } #endif #if defined(TARGET_ARM64) || defined(TARGET_WASM) - int op = id == SN_WidenLower ? 
OP_XLOWER : OP_XUPPER; - MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args); - if (type_enum_is_float (arg0_type)) { - return emit_simd_ins (cfg, klass, OP_SIMD_FCVTL, lower_or_upper_half->dreg, -1); - } else { - int zero = alloc_ireg (cfg); - MONO_EMIT_NEW_ICONST (cfg, zero, 0); - op = type_enum_is_unsigned (arg0_type) ? OP_SIMD_USHLL : OP_SIMD_SSHLL; - return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero); - } + int op = id == SN_WidenLower ? OP_XLOWER : OP_XUPPER; + MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args); + if (type_enum_is_float (arg0_type)) { + return emit_simd_ins (cfg, klass, OP_SIMD_FCVTL, lower_or_upper_half->dreg, -1); + } else { + int zero = alloc_ireg (cfg); + MONO_EMIT_NEW_ICONST (cfg, zero, 0); + op = type_enum_is_unsigned (arg0_type) ? OP_SIMD_USHLL : OP_SIMD_SSHLL; + return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero); + } +#elif defined(TARGET_AMD64) + // FIXME: + return NULL; #else return NULL; #endif @@ -2219,6 +2336,7 @@ static guint16 vector64_vector128_t_methods [] = { SN_op_UnaryPlus, }; +/* Emit intrinsics in System.Runtime.Intrinsics.Vector64/128/256/512 */ static MonoInst* emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args) { @@ -2230,6 +2348,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign MonoClass *klass = cmethod->klass; MonoType *etype = mono_class_get_context (klass)->class_inst->type_argv [0]; + gboolean supported = TRUE; if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; @@ -2247,26 +2366,37 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign g_free (name); } +#if defined(TARGET_WASM) + if (!COMPILE_LLVM (cfg)) + supported = FALSE; +#endif + +// FIXME: Support Vector64 for mini JIT on arm64 +#ifdef TARGET_ARM64 + if (!COMPILE_LLVM (cfg) && (size != 16)) + return 
NULL; +#endif + +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg) && (size != 16)) + supported = FALSE; +#ifdef TARGET_WIN32 + supported = FALSE; +#endif +#endif + switch (id) { case SN_get_IsSupported: { MonoInst *ins = NULL; - EMIT_NEW_ICONST (cfg, ins, 1); + EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0); return ins; } default: break; } -#if defined(TARGET_AMD64) || defined(TARGET_WASM) - if (!COMPILE_LLVM (cfg)) - return NULL; -#endif - -// FIXME: Support Vector64 for mini JIT on arm64 -#ifdef TARGET_ARM64 - if (!COMPILE_LLVM (cfg) && (size != 16)) + if (!supported) return NULL; -#endif switch (id) { case SN_get_Count: { @@ -2283,12 +2413,16 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign return emit_xones (cfg, klass); } case SN_get_One: { - if (size != 16) - return NULL; + guint64 buf [8]; + + /* For Vector64, the upper elements are 0 */ + g_assert (sizeof (buf) >= size); + memset (buf, 0, sizeof (buf)); + switch (etype->type) { case MONO_TYPE_I1: case MONO_TYPE_U1: { - guint8 value[16]; + guint8 *value = (guint8*)buf; for (int i = 0; i < len; ++i) { value [i] = 1; @@ -2298,7 +2432,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign } case MONO_TYPE_I2: case MONO_TYPE_U2: { - guint16 value[8]; + guint16 *value = (guint16*)buf; for (int i = 0; i < len; ++i) { value [i] = 1; @@ -2312,7 +2446,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign #endif case MONO_TYPE_I4: case MONO_TYPE_U4: { - guint32 value[4]; + guint32 *value = (guint32*)buf; for (int i = 0; i < len; ++i) { value [i] = 1; @@ -2326,7 +2460,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign #endif case MONO_TYPE_I8: case MONO_TYPE_U8: { - guint64 value[2]; + guint64 *value = (guint64*)buf; for (int i = 0; i < len; ++i) { value [i] = 1; @@ -2335,7 +2469,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign return emit_xconst_v128 
(cfg, klass, (guint8*)value); } case MONO_TYPE_R4: { - float value[4]; + float *value = (float*)buf; for (int i = 0; i < len; ++i) { value [i] = 1.0f; @@ -2344,7 +2478,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign return emit_xconst_v128 (cfg, klass, (guint8*)value); } case MONO_TYPE_R8: { - double value[2]; + double *value = (double*)buf; for (int i = 0; i < len; ++i) { value [i] = 1.0; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithElementTest.template b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithElementTest.template index bc4f82d21a2e5b..68d0af4ed576d2 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithElementTest.template +++ b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithElementTest.template @@ -75,7 +75,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.GetElement({imm}): {nameof(RunBasicScenario)} failed to throw ArgumentOutOfRangeException."); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.GetElement({imm}): {nameof(RunBasicScenario)} failed to throw ArgumentOutOfRangeException."); TestLibrary.TestFramework.LogInformation(string.Empty); Succeeded = false; @@ -97,7 +97,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithElement({imm}): {nameof(RunBasicScenario)} failed to throw ArgumentOutOfRangeException."); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithElement({imm}): {nameof(RunBasicScenario)} failed to throw ArgumentOutOfRangeException."); TestLibrary.TestFramework.LogInformation(string.Empty); Succeeded = false; @@ -135,7 +135,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.GetElement({imm}): 
{nameof(RunReflectionScenario)} failed to throw ArgumentOutOfRangeException."); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.GetElement({imm}): {nameof(RunReflectionScenario)} failed to throw ArgumentOutOfRangeException."); TestLibrary.TestFramework.LogInformation(string.Empty); Succeeded = false; @@ -161,7 +161,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithElement({imm}): {nameof(RunReflectionScenario)} failed to throw ArgumentOutOfRangeException."); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithElement({imm}): {nameof(RunReflectionScenario)} failed to throw ArgumentOutOfRangeException."); TestLibrary.TestFramework.LogInformation(string.Empty); Succeeded = false; @@ -183,7 +183,7 @@ namespace JIT.HardwareIntrinsics.General { Succeeded = false; - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.GetElement({Imm}): {method} failed:"); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.GetElement({Imm}): {method} failed:"); TestLibrary.TestFramework.LogInformation($" value: ({string.Join(", ", values)})"); TestLibrary.TestFramework.LogInformation($" result: ({result})"); TestLibrary.TestFramework.LogInformation(string.Empty); @@ -217,9 +217,9 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithElement({Imm}): {method} failed:"); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithElement({Imm}): {method} failed:"); TestLibrary.TestFramework.LogInformation($" value: ({string.Join(", ", values)})"); - TestLibrary.TestFramework.LogInformation($" insert: insertedValue"); + TestLibrary.TestFramework.LogInformation($" insert: {insertedValue}"); TestLibrary.TestFramework.LogInformation($" result: ({string.Join(", ", result)})"); TestLibrary.TestFramework.LogInformation(string.Empty); diff --git 
a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithLowerAndUpperTest.template b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithLowerAndUpperTest.template index 1cee5a952a3ee1..1a648dab4a54a7 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithLowerAndUpperTest.template +++ b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithLowerAndUpperTest.template @@ -178,7 +178,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithLower(): {method} failed:"); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithLower(): {method} failed:"); TestLibrary.TestFramework.LogInformation($" value: ({string.Join(", ", values)})"); TestLibrary.TestFramework.LogInformation($" result: ({string.Join(", ", result)})"); TestLibrary.TestFramework.LogInformation(string.Empty); @@ -199,7 +199,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithUpper(): {method} failed:"); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithUpper(): {method} failed:"); TestLibrary.TestFramework.LogInformation($" value: ({string.Join(", ", values)})"); TestLibrary.TestFramework.LogInformation($" result: ({string.Join(", ", result)})"); TestLibrary.TestFramework.LogInformation(string.Empty);