Skip to content

Commit e71a958

Browse files
authored
[mono][jit] Transition the x86 backend to use SSE for fp arithmetic. (#65723)
* [mono][jit] Transition the x86 backend to use SSE for fp arithmetic.
* Add SSE2 and FCMOV to the CPU requirements for mono on x86.
* Also force the usage of r4fp on x86, the same as on ARM.
* Most of the code is copied from amd64-codegen.h and mini-amd64.c.
* Re-enable some tests.
* Fix build failures.
* Remove r4fp conditionals.
* Add missing RCONV_TO_I opcode.
* Fix OP_MOVE_F_TO_I4 and OP_MOVE_I4_TO_F.
* Remove fpstack support code.
* Fix warnings.
* Add back MONO_ARCH_FLOAT32_SUPPORTED on x86.
* Fix dreg type for r4_conv_to_i1 etc. opcodes.
1 parent a194555 commit e71a958

File tree

21 files changed

+1071
-891
lines changed

21 files changed

+1071
-891
lines changed

src/libraries/System.Runtime.InteropServices/tests/System.Runtime.InteropServices.UnitTests/System/Runtime/InteropServices/NFloatTests.cs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,6 @@ public static void op_Increment(float value)
232232
[InlineData(0.0f, 3.14f)]
233233
[InlineData(4567.0f, -3.14f)]
234234
[InlineData(4567.89101f, -3.14569f)]
235-
[ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
236235
public static void op_Addition(float left, float right)
237236
{
238237
NFloat result = new NFloat(left) + new NFloat(right);
@@ -253,7 +252,6 @@ public static void op_Addition(float left, float right)
253252
[InlineData(0.0f, 3.14f)]
254253
[InlineData(4567.0f, -3.14f)]
255254
[InlineData(4567.89101f, -3.14569f)]
256-
[ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
257255
public static void op_Subtraction(float left, float right)
258256
{
259257
NFloat result = new NFloat(left) - new NFloat(right);
@@ -274,7 +272,6 @@ public static void op_Subtraction(float left, float right)
274272
[InlineData(0.0f, 3.14f)]
275273
[InlineData(4567.0f, -3.14f)]
276274
[InlineData(4567.89101f, -3.14569f)]
277-
[ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
278275
public static void op_Multiply(float left, float right)
279276
{
280277
NFloat result = new NFloat(left) * new NFloat(right);
@@ -295,7 +292,6 @@ public static void op_Multiply(float left, float right)
295292
[InlineData(0.0f, 3.14f)]
296293
[InlineData(4567.0f, -3.14f)]
297294
[InlineData(4567.89101f, -3.14569f)]
298-
[ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
299295
public static void op_Division(float left, float right)
300296
{
301297
NFloat result = new NFloat(left) / new NFloat(right);

src/mono/mono/arch/x86/x86-codegen.h

Lines changed: 500 additions & 9 deletions
Large diffs are not rendered by default.

src/mono/mono/mini/aot-compiler.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12638,7 +12638,8 @@ compile_asm (MonoAotCompile *acfg)
1263812638
#define LD_NAME "clang"
1263912639
#define LD_OPTIONS "-m32 -dynamiclib"
1264012640
#elif defined(TARGET_X86) && !defined(TARGET_MACH)
12641-
#define LD_OPTIONS "-m elf_i386 -Bsymbolic"
12641+
#define LD_NAME "ld"
12642+
#define LD_OPTIONS "--shared -m elf_i386"
1264212643
#elif defined(TARGET_ARM) && !defined(TARGET_ANDROID)
1264312644
#define LD_NAME "gcc"
1264412645
#define LD_OPTIONS "--shared -Wl,-Bsymbolic"

src/mono/mono/mini/cpu-x86.mdesc

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ endfinally: len:16
143143
endfilter: src1:a len:16
144144
get_ex_obj: dest:a len:16
145145

146-
ckfinite: dest:f src1:f len:32
146+
ckfinite: dest:f src1:f len:40
147147
ceq: dest:y len:6
148148
cgt: dest:y len:6
149149
cgt_un: dest:y len:6
@@ -153,14 +153,18 @@ localloc: dest:i src1:i len:120
153153
compare: src1:i src2:i len:2
154154
compare_imm: src1:i len:6
155155
fcompare: src1:f src2:f clob:a len:9
156+
rcompare: src1:f src2:f clob:a len:13
156157
arglist: src1:b len:10
157158
check_this: src1:b len:3
158159
voidcall: len:17 clob:c
159160
voidcall_reg: src1:i len:11 clob:c
160161
voidcall_membase: src1:b len:16 clob:c
161-
fcall: dest:f len:17 clob:c
162-
fcall_reg: dest:f src1:i len:11 clob:c
163-
fcall_membase: dest:f src1:b len:16 clob:c
162+
fcall: dest:f len:28 clob:c
163+
fcall_reg: dest:f src1:i len:28 clob:c
164+
fcall_membase: dest:f src1:b len:28 clob:c
165+
rcall: dest:f len:28 clob:c
166+
rcall_reg: dest:f src1:i len:28 clob:c
167+
rcall_membase: dest:f src1:b len:28 clob:c
164168
lcall: dest:l len:17 clob:c
165169
lcall_reg: dest:l src1:i len:11 clob:c
166170
lcall_membase: dest:l src1:b len:16 clob:c
@@ -170,8 +174,8 @@ vcall_membase: src1:b len:16 clob:c
170174
call_reg: dest:a src1:i len:11 clob:c
171175
call_membase: dest:a src1:b len:16 clob:c
172176
iconst: dest:i len:5
173-
r4const: dest:f len:15
174-
r8const: dest:f len:16
177+
r4const: dest:f len:24
178+
r8const: dest:f len:24
175179
store_membase_imm: dest:b len:11
176180
store_membase_reg: dest:b src1:i len:7
177181
storei1_membase_imm: dest:b len:10
@@ -182,8 +186,8 @@ storei4_membase_imm: dest:b len:10
182186
storei4_membase_reg: dest:b src1:i len:7
183187
storei8_membase_imm: dest:b
184188
storei8_membase_reg: dest:b src1:i
185-
storer4_membase_reg: dest:b src1:f len:7
186-
storer8_membase_reg: dest:b src1:f len:7
189+
storer4_membase_reg: dest:b src1:f len:9
190+
storer8_membase_reg: dest:b src1:f len:9
187191
load_membase: dest:i src1:b len:7
188192
loadi1_membase: dest:y src1:b len:7
189193
loadu1_membase: dest:y src1:b len:7
@@ -192,8 +196,8 @@ loadu2_membase: dest:i src1:b len:7
192196
loadi4_membase: dest:i src1:b len:7
193197
loadu4_membase: dest:i src1:b len:7
194198
loadi8_membase: dest:i src1:b
195-
loadr4_membase: dest:f src1:b len:7
196-
loadr8_membase: dest:f src1:b len:7
199+
loadr4_membase: dest:f src1:b len:9
200+
loadr8_membase: dest:f src1:b len:9
197201
loadu4_mem: dest:i len:9
198202
move: dest:i src1:i len:2
199203
addcc_imm: dest:i src1:i len:6 clob:1
@@ -237,25 +241,26 @@ float_bge: len:22
237241
float_bge_un: len:12
238242
float_ble: len:22
239243
float_ble_un: len:12
240-
float_add: dest:f src1:f src2:f len:2
241-
float_sub: dest:f src1:f src2:f len:2
242-
float_mul: dest:f src1:f src2:f len:2
243-
float_div: dest:f src1:f src2:f len:2
244-
float_div_un: dest:f src1:f src2:f len:2
244+
float_add: dest:f src1:f src2:f len:8
245+
float_sub: dest:f src1:f src2:f len:8
246+
float_mul: dest:f src1:f src2:f len:8
247+
float_div: dest:f src1:f src2:f len:8
248+
float_div_un: dest:f src1:f src2:f len:8
245249
float_rem: dest:f src1:f src2:f len:17
246250
float_rem_un: dest:f src1:f src2:f len:17
247-
float_neg: dest:f src1:f len:2
251+
float_neg: dest:f src1:f len:24
248252
float_not: dest:f src1:f len:2
249253
float_conv_to_i1: dest:y src1:f len:39
250254
float_conv_to_i2: dest:y src1:f len:39
251255
float_conv_to_i4: dest:i src1:f len:39
252-
float_conv_to_i8: dest:L src1:f len:39
256+
float_conv_to_i8: dest:L src1:f len:50
253257
float_conv_to_u4: dest:i src1:f len:39
254258
float_conv_to_u8: dest:L src1:f len:39
255259
float_conv_to_u2: dest:y src1:f len:39
256260
float_conv_to_u1: dest:y src1:f len:39
257261
float_conv_to_ovf_i: dest:a src1:f len:30
258262
float_conv_to_ovd_u: dest:a src1:f len:30
263+
float_conv_to_r4: dest:f src1:f len:17
259264
float_mul_ovf:
260265
float_ceq: dest:y src1:f src2:f len:25
261266
float_cgt: dest:y src1:f src2:f len:25
@@ -312,7 +317,7 @@ sbb_imm: dest:i src1:i len:6 clob:1
312317
br_reg: src1:i len:2
313318
sin: dest:f src1:f len:6
314319
cos: dest:f src1:f len:6
315-
abs: dest:f src1:f len:2
320+
abs: dest:f src1:f clob:1 len:16
316321
tan: dest:f src1:f len:49
317322
atan: dest:f src1:f len:8
318323
sqrt: dest:f src1:f len:2
@@ -423,11 +428,12 @@ cmov_ile_un: dest:i src1:i src2:i len:16 clob:1
423428
cmov_ilt_un: dest:i src1:i src2:i len:16 clob:1
424429

425430
long_conv_to_ovf_i4_2: dest:i src1:i src2:i len:30
426-
long_conv_to_r8_2: dest:f src1:i src2:i len:14
427-
long_conv_to_r4_2: dest:f src1:i src2:i len:14
431+
long_conv_to_r8_2: dest:f src1:i src2:i len:24
432+
long_conv_to_r4_2: dest:f src1:i src2:i len:24
428433
long_conv_to_r_un_2: dest:f src1:i src2:i len:40
429434

430-
fmove: dest:f src1:f
435+
fmove: dest:f src1:f len:4
436+
rmove: dest:f src1:f len:4
431437
move_f_to_i4: dest:i src1:f len:17
432438
move_i4_to_f: dest:f src1:i len:17
433439
float_conv_to_r4: dest:f src1:f len:12
@@ -671,3 +677,32 @@ set_sp: src1:i len:6
671677
fill_prof_call_ctx: src1:i len:128
672678

673679
get_last_error: dest:i len:32
680+
681+
x86_move_r8_to_fpstack: src1:f len:16
682+
x86_move_r4_to_fpstack: src1:f len:16
683+
iconv_to_r4_raw: dest:f src1:i len:10
684+
685+
# R4 opcodes
686+
r4_conv_to_i1: dest:y src1:f len:32
687+
r4_conv_to_u1: dest:y src1:f len:32
688+
r4_conv_to_i2: dest:y src1:f len:32
689+
r4_conv_to_u2: dest:y src1:f len:32
690+
r4_conv_to_i4: dest:i src1:f len:16
691+
r4_conv_to_u4: dest:i src1:f len:32
692+
r4_conv_to_i8: dest:L src1:f len:64
693+
r4_conv_to_i: dest:i src1:f len:32
694+
r4_conv_to_r8: dest:f src1:f len:17
695+
r4_conv_to_r4: dest:f src1:f len:17
696+
r4_add: dest:f src1:f src2:f clob:1 len:5
697+
r4_sub: dest:f src1:f src2:f clob:1 len:5
698+
r4_mul: dest:f src1:f src2:f clob:1 len:5
699+
r4_div: dest:f src1:f src2:f clob:1 len:5
700+
r4_neg: dest:f src1:f clob:1 len:23
701+
r4_ceq: dest:y src1:f src2:f len:35
702+
r4_cgt: dest:y src1:f src2:f len:35
703+
r4_cgt_un: dest:y src1:f src2:f len:48
704+
r4_clt: dest:y src1:f src2:f len:35
705+
r4_clt_un: dest:y src1:f src2:f len:42
706+
r4_cneq: dest:y src1:f src2:f len:42
707+
r4_cge: dest:y src1:f src2:f len:35
708+
r4_cle: dest:y src1:f src2:f len:35

src/mono/mono/mini/local-propagation.c

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,6 @@ mono_local_cprop (MonoCompile *cfg)
623623
/* This avoids propagating local vregs across calls */
624624
((get_vreg_to_inst (cfg, def->sreg1) || !defs [def->sreg1] || (def_index [def->sreg1] >= last_call_index) || (def->opcode == OP_VMOVE))) &&
625625
!(defs [def->sreg1] && mono_inst_next (defs [def->sreg1], filter) == def) &&
626-
(!MONO_ARCH_USE_FPSTACK || (def->opcode != OP_FMOVE)) &&
627626
(def->opcode != OP_FMOVE)) {
628627
int vreg = def->sreg1;
629628

@@ -640,7 +639,7 @@ mono_local_cprop (MonoCompile *cfg)
640639
/* is_inst_imm is only needed for binops */
641640
if ((((def->opcode == OP_ICONST) || ((sizeof (gpointer) == 8) && (def->opcode == OP_I8CONST)) || (def->opcode == OP_PCONST)))
642641
||
643-
(!MONO_ARCH_USE_FPSTACK && (def->opcode == OP_R8CONST))) {
642+
(def->opcode == OP_R8CONST)) {
644643
guint32 opcode2;
645644

646645
/* srcindex == 1 -> binop, ins->sreg2 == -1 -> unop */
@@ -815,17 +814,6 @@ mono_local_cprop (MonoCompile *cfg)
815814
}
816815
}
817816

818-
static gboolean
819-
reg_is_softreg_no_fpstack (int reg, const char spec)
820-
{
821-
return (spec == 'i' && reg >= MONO_MAX_IREGS)
822-
|| ((spec == 'f' && reg >= MONO_MAX_FREGS) && !MONO_ARCH_USE_FPSTACK)
823-
#ifdef MONO_ARCH_SIMD_INTRINSICS
824-
|| (spec == 'x' && reg >= MONO_MAX_XREGS)
825-
#endif
826-
|| (spec == 'v');
827-
}
828-
829817
static gboolean
830818
reg_is_softreg (int reg, const char spec)
831819
{
@@ -953,8 +941,7 @@ mono_local_deadce (MonoCompile *cfg)
953941
}
954942
}
955943

956-
/* Enabling this on x86 could screw up the fp stack */
957-
if (reg_is_softreg_no_fpstack (ins->dreg, spec [MONO_INST_DEST])) {
944+
if (reg_is_softreg (ins->dreg, spec [MONO_INST_DEST])) {
958945
/*
959946
* Assignments to global vregs can only be eliminated if there is another
960947
* assignment to the same vreg later in the same bblock.

src/mono/mono/mini/method-to-ir.c

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7181,12 +7181,6 @@ mono_method_to_ir (MonoCompile *cfg, MonoMethod *method, MonoBasicBlock *start_b
71817181
}
71827182
case MONO_CEE_POP:
71837183
--sp;
7184-
7185-
#ifdef TARGET_X86
7186-
if (sp [0]->type == STACK_R8)
7187-
/* we need to pop the value from the x86 FP stack */
7188-
MONO_EMIT_NEW_UNALU (cfg, OP_X86_FPOP, -1, sp [0]->dreg);
7189-
#endif
71907184
break;
71917185
case MONO_CEE_JMP: {
71927186
MonoCallInst *call;
@@ -13057,7 +13051,7 @@ mono_spill_global_vars (MonoCompile *cfg, gboolean *need_local_opts)
1305713051
* sregs could use it. So set a flag, and do it after
1305813052
* the sregs.
1305913053
*/
13060-
if ((!cfg->backend->use_fpstack || ((store_opcode != OP_STORER8_MEMBASE_REG) && (store_opcode != OP_STORER4_MEMBASE_REG))) && !((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)))
13054+
if (!((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)))
1306113055
dest_has_lvreg = TRUE;
1306213056
}
1306313057
}
@@ -13147,7 +13141,7 @@ mono_spill_global_vars (MonoCompile *cfg, gboolean *need_local_opts)
1314713141

1314813142
sreg = alloc_dreg (cfg, stacktypes [regtype]);
1314913143

13150-
if ((!cfg->backend->use_fpstack || ((load_opcode != OP_LOADR8_MEMBASE) && (load_opcode != OP_LOADR4_MEMBASE))) && !((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && !no_lvreg) {
13144+
if (!((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && !no_lvreg) {
1315113145
if (var->dreg == prev_dreg) {
1315213146
/*
1315313147
* sreg refers to the value loaded by the load

src/mono/mono/mini/mini-amd64.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,6 @@ struct sigcontext {
126126
#define MONO_ARCH_USE_SHARED_FP_SIMD_BANK 1
127127
#endif
128128

129-
130-
131129
#if defined(__APPLE__)
132130
#define MONO_ARCH_SIGNAL_STACK_SIZE MINSIGSTKSZ
133131
#else
@@ -164,8 +162,6 @@ struct sigcontext {
164162
#define MONO_ARCH_CALLEE_REGS AMD64_CALLEE_REGS
165163
#define MONO_ARCH_CALLEE_SAVED_REGS AMD64_CALLEE_SAVED_REGS
166164

167-
#define MONO_ARCH_USE_FPSTACK FALSE
168-
169165
#define MONO_ARCH_INST_FIXED_REG(desc) ((desc == '\0') ? -1 : ((desc == 'i' ? -1 : ((desc == 'a') ? AMD64_RAX : ((desc == 's') ? AMD64_RCX : ((desc == 'd') ? AMD64_RDX : ((desc == 'A') ? MONO_AMD64_ARG_REG1 : -1)))))))
170166

171167
/* RDX is clobbered by the opcode implementation before accessing sreg2 */

src/mono/mono/mini/mini-arm.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@
9292
#define MONO_ARCH_CALLEE_SAVED_FREGS 0x00000000
9393
#endif
9494

95-
#define MONO_ARCH_USE_FPSTACK FALSE
96-
9795
#define MONO_ARCH_INST_SREG2_MASK(ins) (0)
9896

9997
#define MONO_ARCH_INST_FIXED_REG(desc) \

src/mono/mono/mini/mini-arm64.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,6 @@
5656

5757
#define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS
5858

59-
#define MONO_ARCH_USE_FPSTACK FALSE
60-
6159
#define MONO_ARCH_INST_SREG2_MASK(ins) (0)
6260

6361
#define MONO_ARCH_INST_FIXED_REG(desc) ((desc) == 'a' ? ARMREG_R0 : -1)
@@ -68,8 +66,6 @@
6866

6967
#define MONO_ARCH_INST_REGPAIR_REG2(desc,hreg1) (-1)
7068

71-
#define MONO_ARCH_USE_FPSTACK FALSE
72-
7369
#define MONO_ARCH_FRAME_ALIGNMENT 16
7470

7571
#define MONO_ARCH_CODE_ALIGNMENT 32

0 commit comments

Comments
 (0)