[mono][jit] Transition the x86 backend to use SSE for fp arithmetic. (#65723)

* [mono][jit] Transition the x86 backend to use SSE for fp arithmetic.
* Add SSE2 and FCMOV to the cpu requirements for mono on x86.
* Also force the usage of r4fp on x86, the same as on arm.
* Most of the code is copied from amd64-codegen.h and mini-amd64.c.
* Reenable some tests.
* Fix build failures.
* Remove r4fp conditionals.
* Add missing RCONV_TO_I opcode.
* Fix OP_MOVE_F_TO_I4 and OP_MOVE_I4_TO_F.
* Remove fpstack support code.
* Fix warnings.
* Add back MONO_ARCH_FLOAT32_SUPPORTED on x86.
* Fix dreg type for r4_conv_to_i1 etc. opcodes.
dotnet · Aug 8, 2022 · e71a958 · e71a958
1 parent a194555
commit e71a958
Show file tree

Hide file tree

Showing 21 changed files with 1,071 additions and 891 deletions.
diff --git a/...ts/System.Runtime.InteropServices.UnitTests/System/Runtime/InteropServices/NFloatTests.cs b/...ts/System.Runtime.InteropServices.UnitTests/System/Runtime/InteropServices/NFloatTests.cs
@@ -232,7 +232,6 @@ public static void op_Increment(float value)
         [InlineData(0.0f, 3.14f)]
         [InlineData(4567.0f, -3.14f)]
         [InlineData(4567.89101f, -3.14569f)]
-        [ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
         public static void op_Addition(float left, float right)
         {
             NFloat result = new NFloat(left) + new NFloat(right);
@@ -253,7 +252,6 @@ public static void op_Addition(float left, float right)
         [InlineData(0.0f, 3.14f)]
         [InlineData(4567.0f, -3.14f)]
         [InlineData(4567.89101f, -3.14569f)]
-        [ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
         public static void op_Subtraction(float left, float right)
         {
             NFloat result = new NFloat(left) - new NFloat(right);
@@ -274,7 +272,6 @@ public static void op_Subtraction(float left, float right)
         [InlineData(0.0f, 3.14f)]
         [InlineData(4567.0f, -3.14f)]
         [InlineData(4567.89101f, -3.14569f)]
-        [ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
         public static void op_Multiply(float left, float right)
         {
             NFloat result = new NFloat(left) * new NFloat(right);
@@ -295,7 +292,6 @@ public static void op_Multiply(float left, float right)
         [InlineData(0.0f, 3.14f)]
         [InlineData(4567.0f, -3.14f)]
         [InlineData(4567.89101f, -3.14569f)]
-        [ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))]
         public static void op_Division(float left, float right)
         {
             NFloat result = new NFloat(left) / new NFloat(right);

diff --git a/src/mono/mono/arch/x86/x86-codegen.h b/src/mono/mono/arch/x86/x86-codegen.h
diff --git a/src/mono/mono/mini/aot-compiler.c b/src/mono/mono/mini/aot-compiler.c
@@ -12638,7 +12638,8 @@ compile_asm (MonoAotCompile *acfg)
 #define LD_NAME "clang"
 #define LD_OPTIONS "-m32 -dynamiclib"
 #elif defined(TARGET_X86) && !defined(TARGET_MACH)
-#define LD_OPTIONS "-m elf_i386 -Bsymbolic"
+#define LD_NAME "ld"
+#define LD_OPTIONS "--shared -m elf_i386"
 #elif defined(TARGET_ARM) && !defined(TARGET_ANDROID)
 #define LD_NAME "gcc"
 #define LD_OPTIONS "--shared -Wl,-Bsymbolic"

diff --git a/src/mono/mono/mini/cpu-x86.mdesc b/src/mono/mono/mini/cpu-x86.mdesc
@@ -143,7 +143,7 @@ endfinally: len:16
 endfilter: src1:a len:16
 get_ex_obj: dest:a len:16
 
-ckfinite: dest:f src1:f len:32
+ckfinite: dest:f src1:f len:40
 ceq: dest:y len:6
 cgt: dest:y len:6
 cgt_un: dest:y len:6
@@ -153,14 +153,18 @@ localloc: dest:i src1:i len:120
 compare: src1:i src2:i len:2
 compare_imm: src1:i len:6
 fcompare: src1:f src2:f clob:a len:9
+rcompare: src1:f src2:f clob:a len:13
 arglist: src1:b len:10
 check_this: src1:b len:3
 voidcall: len:17 clob:c
 voidcall_reg: src1:i len:11 clob:c
 voidcall_membase: src1:b len:16 clob:c
-fcall: dest:f len:17 clob:c
-fcall_reg: dest:f src1:i len:11 clob:c
-fcall_membase: dest:f src1:b len:16 clob:c
+fcall: dest:f len:28 clob:c
+fcall_reg: dest:f src1:i len:28 clob:c
+fcall_membase: dest:f src1:b len:28 clob:c
+rcall: dest:f len:28 clob:c
+rcall_reg: dest:f src1:i len:28 clob:c
+rcall_membase: dest:f src1:b len:28 clob:c
 lcall: dest:l len:17 clob:c
 lcall_reg: dest:l src1:i len:11 clob:c
 lcall_membase: dest:l src1:b len:16 clob:c
@@ -170,8 +174,8 @@ vcall_membase: src1:b len:16 clob:c
 call_reg: dest:a src1:i len:11 clob:c
 call_membase: dest:a src1:b len:16 clob:c
 iconst: dest:i len:5
-r4const: dest:f len:15
-r8const: dest:f len:16
+r4const: dest:f len:24
+r8const: dest:f len:24
 store_membase_imm: dest:b len:11
 store_membase_reg: dest:b src1:i len:7
 storei1_membase_imm: dest:b len:10
@@ -182,8 +186,8 @@ storei4_membase_imm: dest:b len:10
 storei4_membase_reg: dest:b src1:i len:7
 storei8_membase_imm: dest:b
 storei8_membase_reg: dest:b src1:i
-storer4_membase_reg: dest:b src1:f len:7
-storer8_membase_reg: dest:b src1:f len:7
+storer4_membase_reg: dest:b src1:f len:9
+storer8_membase_reg: dest:b src1:f len:9
 load_membase: dest:i src1:b len:7
 loadi1_membase: dest:y src1:b len:7
 loadu1_membase: dest:y src1:b len:7
@@ -192,8 +196,8 @@ loadu2_membase: dest:i src1:b len:7
 loadi4_membase: dest:i src1:b len:7
 loadu4_membase: dest:i src1:b len:7
 loadi8_membase: dest:i src1:b
-loadr4_membase: dest:f src1:b len:7
-loadr8_membase: dest:f src1:b len:7
+loadr4_membase: dest:f src1:b len:9
+loadr8_membase: dest:f src1:b len:9
 loadu4_mem: dest:i len:9
 move: dest:i src1:i len:2
 addcc_imm: dest:i src1:i len:6 clob:1
@@ -237,25 +241,26 @@ float_bge: len:22
 float_bge_un: len:12
 float_ble: len:22
 float_ble_un: len:12
-float_add: dest:f src1:f src2:f len:2
-float_sub: dest:f src1:f src2:f len:2
-float_mul: dest:f src1:f src2:f len:2
-float_div: dest:f src1:f src2:f len:2
-float_div_un: dest:f src1:f src2:f len:2
+float_add: dest:f src1:f src2:f len:8
+float_sub: dest:f src1:f src2:f len:8
+float_mul: dest:f src1:f src2:f len:8
+float_div: dest:f src1:f src2:f len:8
+float_div_un: dest:f src1:f src2:f len:8
 float_rem: dest:f src1:f src2:f len:17
 float_rem_un: dest:f src1:f src2:f len:17
-float_neg: dest:f src1:f len:2
+float_neg: dest:f src1:f len:24
 float_not: dest:f src1:f len:2
 float_conv_to_i1: dest:y src1:f len:39
 float_conv_to_i2: dest:y src1:f len:39
 float_conv_to_i4: dest:i src1:f len:39
-float_conv_to_i8: dest:L src1:f len:39
+float_conv_to_i8: dest:L src1:f len:50
 float_conv_to_u4: dest:i src1:f len:39
 float_conv_to_u8: dest:L src1:f len:39
 float_conv_to_u2: dest:y src1:f len:39
 float_conv_to_u1: dest:y src1:f len:39
 float_conv_to_ovf_i: dest:a src1:f len:30
 float_conv_to_ovd_u: dest:a src1:f len:30
+float_conv_to_r4: dest:f src1:f len:17
 float_mul_ovf:
 float_ceq: dest:y src1:f src2:f len:25
 float_cgt: dest:y src1:f src2:f len:25
@@ -312,7 +317,7 @@ sbb_imm: dest:i src1:i len:6 clob:1
 br_reg: src1:i len:2
 sin: dest:f src1:f len:6
 cos: dest:f src1:f len:6
-abs: dest:f src1:f len:2
+abs: dest:f src1:f clob:1 len:16
 tan: dest:f src1:f len:49
 atan: dest:f src1:f len:8
 sqrt: dest:f src1:f len:2
@@ -423,11 +428,12 @@ cmov_ile_un: dest:i src1:i src2:i len:16 clob:1
 cmov_ilt_un: dest:i src1:i src2:i len:16 clob:1
 
 long_conv_to_ovf_i4_2: dest:i src1:i src2:i len:30
-long_conv_to_r8_2: dest:f src1:i src2:i len:14
-long_conv_to_r4_2: dest:f src1:i src2:i len:14
+long_conv_to_r8_2: dest:f src1:i src2:i len:24
+long_conv_to_r4_2: dest:f src1:i src2:i len:24
 long_conv_to_r_un_2: dest:f src1:i src2:i len:40
 
-fmove: dest:f src1:f
+fmove: dest:f src1:f len:4
+rmove: dest:f src1:f len:4
 move_f_to_i4: dest:i src1:f len:17
 move_i4_to_f: dest:f src1:i len:17
 float_conv_to_r4: dest:f src1:f  len:12
@@ -671,3 +677,32 @@ set_sp: src1:i len:6
 fill_prof_call_ctx: src1:i len:128
 
 get_last_error: dest:i len:32
+
+x86_move_r8_to_fpstack: src1:f len:16
+x86_move_r4_to_fpstack: src1:f len:16
+iconv_to_r4_raw: dest:f src1:i len:10
+
+# R4 opcodes
+r4_conv_to_i1: dest:y src1:f len:32
+r4_conv_to_u1: dest:y src1:f len:32
+r4_conv_to_i2: dest:y src1:f len:32
+r4_conv_to_u2: dest:y src1:f len:32
+r4_conv_to_i4: dest:i src1:f len:16
+r4_conv_to_u4: dest:i src1:f len:32
+r4_conv_to_i8: dest:L src1:f len:64
+r4_conv_to_i: dest:i src1:f len:32
+r4_conv_to_r8: dest:f src1:f len:17
+r4_conv_to_r4: dest:f src1:f len:17
+r4_add: dest:f src1:f src2:f clob:1 len:5
+r4_sub: dest:f src1:f src2:f clob:1 len:5
+r4_mul: dest:f src1:f src2:f clob:1 len:5
+r4_div: dest:f src1:f src2:f clob:1 len:5
+r4_neg: dest:f src1:f clob:1 len:23
+r4_ceq: dest:y src1:f src2:f len:35
+r4_cgt: dest:y src1:f src2:f len:35
+r4_cgt_un: dest:y src1:f src2:f len:48
+r4_clt: dest:y src1:f src2:f len:35
+r4_clt_un: dest:y src1:f src2:f len:42
+r4_cneq: dest:y src1:f src2:f len:42
+r4_cge: dest:y src1:f src2:f len:35
+r4_cle: dest:y src1:f src2:f len:35
diff --git a/src/mono/mono/mini/local-propagation.c b/src/mono/mono/mini/local-propagation.c
@@ -623,7 +623,6 @@ mono_local_cprop (MonoCompile *cfg)
 					/* This avoids propagating local vregs across calls */
 					((get_vreg_to_inst (cfg, def->sreg1) || !defs [def->sreg1] || (def_index [def->sreg1] >= last_call_index) || (def->opcode == OP_VMOVE))) &&
 					!(defs [def->sreg1] && mono_inst_next (defs [def->sreg1], filter) == def) &&
-					(!MONO_ARCH_USE_FPSTACK || (def->opcode != OP_FMOVE)) &&
 					(def->opcode != OP_FMOVE)) {
 					int vreg = def->sreg1;
 
@@ -640,7 +639,7 @@ mono_local_cprop (MonoCompile *cfg)
 				/* is_inst_imm is only needed for binops */
 				if ((((def->opcode == OP_ICONST) || ((sizeof (gpointer) == 8) && (def->opcode == OP_I8CONST)) || (def->opcode == OP_PCONST)))
 					||
-					(!MONO_ARCH_USE_FPSTACK && (def->opcode == OP_R8CONST))) {
+					(def->opcode == OP_R8CONST)) {
 					guint32 opcode2;
 
 					/* srcindex == 1 -> binop, ins->sreg2 == -1 -> unop */
@@ -815,17 +814,6 @@ mono_local_cprop (MonoCompile *cfg)
 	}
 }
 
-static gboolean
-reg_is_softreg_no_fpstack (int reg, const char spec)
-{
-	return (spec == 'i' && reg >= MONO_MAX_IREGS)
-		|| ((spec == 'f' && reg >= MONO_MAX_FREGS) && !MONO_ARCH_USE_FPSTACK)
-#ifdef MONO_ARCH_SIMD_INTRINSICS
-		|| (spec == 'x' && reg >= MONO_MAX_XREGS)
-#endif
-		|| (spec == 'v');
-}
-
 static gboolean
 reg_is_softreg (int reg, const char spec)
 {
@@ -953,8 +941,7 @@ mono_local_deadce (MonoCompile *cfg)
 				}
 			}
 
-			/* Enabling this on x86 could screw up the fp stack */
-			if (reg_is_softreg_no_fpstack (ins->dreg, spec [MONO_INST_DEST])) {
+			if (reg_is_softreg (ins->dreg, spec [MONO_INST_DEST])) {
 				/*
 				 * Assignments to global vregs can only be eliminated if there is another
 				 * assignment to the same vreg later in the same bblock.

diff --git a/src/mono/mono/mini/method-to-ir.c b/src/mono/mono/mini/method-to-ir.c
@@ -7181,12 +7181,6 @@ mono_method_to_ir (MonoCompile *cfg, MonoMethod *method, MonoBasicBlock *start_b
 		}
 		case MONO_CEE_POP:
 			--sp;
-
-#ifdef TARGET_X86
-			if (sp [0]->type == STACK_R8)
-				/* we need to pop the value from the x86 FP stack */
-				MONO_EMIT_NEW_UNALU (cfg, OP_X86_FPOP, -1, sp [0]->dreg);
-#endif
 			break;
 		case MONO_CEE_JMP: {
 			MonoCallInst *call;
@@ -13057,7 +13051,7 @@ mono_spill_global_vars (MonoCompile *cfg, gboolean *need_local_opts)
 							 * sregs could use it. So set a flag, and do it after
 							 * the sregs.
 							 */
-							if ((!cfg->backend->use_fpstack || ((store_opcode != OP_STORER8_MEMBASE_REG) && (store_opcode != OP_STORER4_MEMBASE_REG))) && !((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)))
+							if (!((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)))
 								dest_has_lvreg = TRUE;
 						}
 					}
@@ -13147,7 +13141,7 @@ mono_spill_global_vars (MonoCompile *cfg, gboolean *need_local_opts)
 
 							sreg = alloc_dreg (cfg, stacktypes [regtype]);
 
-							if ((!cfg->backend->use_fpstack || ((load_opcode != OP_LOADR8_MEMBASE) && (load_opcode != OP_LOADR4_MEMBASE))) && !((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && !no_lvreg) {
+							if (!((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && !no_lvreg) {
 								if (var->dreg == prev_dreg) {
 									/*
 									 * sreg refers to the value loaded by the load

diff --git a/src/mono/mono/mini/mini-amd64.h b/src/mono/mono/mini/mini-amd64.h
@@ -126,8 +126,6 @@ struct sigcontext {
 #define MONO_ARCH_USE_SHARED_FP_SIMD_BANK 1
 #endif
 
-
-
 #if defined(__APPLE__)
 #define MONO_ARCH_SIGNAL_STACK_SIZE MINSIGSTKSZ
 #else
@@ -164,8 +162,6 @@ struct sigcontext {
 #define MONO_ARCH_CALLEE_REGS AMD64_CALLEE_REGS
 #define MONO_ARCH_CALLEE_SAVED_REGS AMD64_CALLEE_SAVED_REGS
 
-#define MONO_ARCH_USE_FPSTACK FALSE
-
 #define MONO_ARCH_INST_FIXED_REG(desc) ((desc == '\0') ? -1 : ((desc == 'i' ? -1 : ((desc == 'a') ? AMD64_RAX : ((desc == 's') ? AMD64_RCX : ((desc == 'd') ? AMD64_RDX : ((desc == 'A') ? MONO_AMD64_ARG_REG1 : -1)))))))
 
 /* RDX is clobbered by the opcode implementation before accessing sreg2 */

diff --git a/src/mono/mono/mini/mini-arm.h b/src/mono/mono/mini/mini-arm.h
@@ -92,8 +92,6 @@
 #define MONO_ARCH_CALLEE_SAVED_FREGS 0x00000000
 #endif
 
-#define MONO_ARCH_USE_FPSTACK FALSE
-
 #define MONO_ARCH_INST_SREG2_MASK(ins) (0)
 
 #define MONO_ARCH_INST_FIXED_REG(desc) \

diff --git a/src/mono/mono/mini/mini-arm64.h b/src/mono/mono/mini/mini-arm64.h
@@ -56,8 +56,6 @@
 
 #define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS
 
-#define MONO_ARCH_USE_FPSTACK FALSE
-
 #define MONO_ARCH_INST_SREG2_MASK(ins) (0)
 
 #define MONO_ARCH_INST_FIXED_REG(desc) ((desc) == 'a' ? ARMREG_R0 : -1)
@@ -68,8 +66,6 @@
 
 #define MONO_ARCH_INST_REGPAIR_REG2(desc,hreg1) (-1)
 
-#define MONO_ARCH_USE_FPSTACK FALSE
-
 #define MONO_ARCH_FRAME_ALIGNMENT 16
 
 #define MONO_ARCH_CODE_ALIGNMENT 32