diff --git a/src/libraries/System.Runtime.InteropServices/tests/System.Runtime.InteropServices.UnitTests/System/Runtime/InteropServices/NFloatTests.cs b/src/libraries/System.Runtime.InteropServices/tests/System.Runtime.InteropServices.UnitTests/System/Runtime/InteropServices/NFloatTests.cs index 0306fc45d7c71..0c31993d24b0e 100644 --- a/src/libraries/System.Runtime.InteropServices/tests/System.Runtime.InteropServices.UnitTests/System/Runtime/InteropServices/NFloatTests.cs +++ b/src/libraries/System.Runtime.InteropServices/tests/System.Runtime.InteropServices.UnitTests/System/Runtime/InteropServices/NFloatTests.cs @@ -232,7 +232,6 @@ public static void op_Increment(float value) [InlineData(0.0f, 3.14f)] [InlineData(4567.0f, -3.14f)] [InlineData(4567.89101f, -3.14569f)] - [ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))] public static void op_Addition(float left, float right) { NFloat result = new NFloat(left) + new NFloat(right); @@ -253,7 +252,6 @@ public static void op_Addition(float left, float right) [InlineData(0.0f, 3.14f)] [InlineData(4567.0f, -3.14f)] [InlineData(4567.89101f, -3.14569f)] - [ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))] public static void op_Subtraction(float left, float right) { NFloat result = new NFloat(left) - new NFloat(right); @@ -274,7 +272,6 @@ public static void op_Subtraction(float left, float right) [InlineData(0.0f, 3.14f)] [InlineData(4567.0f, -3.14f)] [InlineData(4567.89101f, -3.14569f)] - [ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))] public static void op_Multiply(float left, float right) { NFloat result = new NFloat(left) * new NFloat(right); @@ -295,7 +292,6 @@ 
public static void op_Multiply(float left, float right) [InlineData(0.0f, 3.14f)] [InlineData(4567.0f, -3.14f)] [InlineData(4567.89101f, -3.14569f)] - [ActiveIssue("https://github.com/dotnet/runtime/issues/65557", typeof(PlatformDetection), nameof(PlatformDetection.IsAndroid), nameof(PlatformDetection.Is32BitProcess))] public static void op_Division(float left, float right) { NFloat result = new NFloat(left) / new NFloat(right); diff --git a/src/mono/mono/arch/x86/x86-codegen.h b/src/mono/mono/arch/x86/x86-codegen.h index b5a20ab65a6a9..74f96f81af407 100644 --- a/src/mono/mono/arch/x86/x86-codegen.h +++ b/src/mono/mono/arch/x86/x86-codegen.h @@ -2377,15 +2377,6 @@ typedef enum { x86_reg_emit ((inst), (dreg), (sreg)); \ } while (0) -#define x86_movd_xreg_membase(inst,sreg,basereg,disp) \ - do { \ - x86_codegen_pre(&(inst), 3 + kMaxMembaseEmitPadding); \ - x86_byte (inst, 0x66); \ - x86_byte (inst, 0x0f); \ - x86_byte (inst, 0x6e); \ - x86_membase_emit ((inst), (sreg), (basereg), (disp)); \ - } while (0) - #define x86_pshufw_reg_reg(inst,dreg,sreg,mask,high_words) \ do { \ x86_codegen_pre(&(inst), 5); \ @@ -2408,4 +2399,504 @@ typedef enum { x86_sse_alu_pd_reg_reg (inst, opc, dreg, sreg); \ } while (0) +/* + * SSE + */ + +/* Avoid errors if amd64-codegen.h is also included */ +#ifdef TARGET_X86 + +/* Two opcode SSE defines */ + +#define emit_sse_reg_reg_op2_size(inst,dreg,reg,op1,op2,size) do { \ + *(inst)++ = (unsigned char)(op1); \ + *(inst)++ = (unsigned char)(op2); \ + x86_reg_emit ((inst), (dreg), (reg)); \ +} while (0) + +#define emit_sse_reg_reg_op2(inst,dreg,reg,op1,op2) emit_sse_reg_reg_op2_size ((inst), (dreg), (reg), (op1), (op2), 0) + +#define emit_sse_reg_reg_op2_imm(inst,dreg,reg,op1,op2,imm) do { \ + emit_sse_reg_reg_op2 ((inst), (dreg), (reg), (op1), (op2)); \ + x86_imm_emit8 ((inst), (imm)); \ +} while (0) + +#define emit_sse_membase_reg_op2(inst,basereg,disp,reg,op1,op2) do { \ + *(inst)++ = (unsigned char)(op1); \ + *(inst)++ = (unsigned 
char)(op2); \ + x86_membase_emit ((inst), (reg), (basereg), (disp)); \ +} while (0) + +#define emit_sse_reg_membase_op2(inst,dreg,basereg,disp,op1,op2) do { \ + *(inst)++ = (unsigned char)(op1); \ + *(inst)++ = (unsigned char)(op2); \ + x86_membase_emit ((inst), (dreg), (basereg), (disp)); \ +} while (0) + +/* Three opcode SSE defines */ + +#define emit_opcode3(inst,op1,op2,op3) do { \ + *(inst)++ = (unsigned char)(op1); \ + *(inst)++ = (unsigned char)(op2); \ + *(inst)++ = (unsigned char)(op3); \ +} while (0) + +#define emit_sse_reg_reg_size(inst,dreg,reg,op1,op2,op3,size) do { \ + *(inst)++ = (unsigned char)(op1); \ + *(inst)++ = (unsigned char)(op2); \ + *(inst)++ = (unsigned char)(op3); \ + x86_reg_emit ((inst), (dreg), (reg)); \ +} while (0) + +#define emit_sse_reg_reg(inst,dreg,reg,op1,op2,op3) emit_sse_reg_reg_size ((inst), (dreg), (reg), (op1), (op2), (op3), 0) + +#define emit_sse_reg_reg_imm(inst,dreg,reg,op1,op2,op3,imm) do { \ + emit_sse_reg_reg ((inst), (dreg), (reg), (op1), (op2), (op3)); \ + x86_imm_emit8 ((inst), (imm)); \ +} while (0) + +#define emit_sse_membase_reg(inst,basereg,disp,reg,op1,op2,op3) do { \ + x86_prefix((inst), (unsigned char)(op1)); \ + *(inst)++ = (unsigned char)(op2); \ + *(inst)++ = (unsigned char)(op3); \ + x86_membase_emit ((inst), (reg), (basereg), (disp)); \ +} while (0) + +#define emit_sse_reg_membase(inst,dreg,basereg,disp,op1,op2,op3) do { \ + x86_prefix((inst), (unsigned char)(op1)); \ + *(inst)++ = (unsigned char)(op2); \ + *(inst)++ = (unsigned char)(op3); \ + x86_membase_emit ((inst), (dreg), (basereg), (disp)); \ +} while (0) + +/* 3 opcode bytes + 1 address byte */ +#define X86_SSE_REG_MEM_OFFSET (3 + 1) + +#define emit_sse_reg_mem(inst,dreg,mem,op1,op2,op3) do { \ + x86_prefix((inst), (unsigned char)(op1)); \ + *(inst)++ = (unsigned char)(op2); \ + *(inst)++ = (unsigned char)(op3); \ + x86_mem_emit ((inst), (dreg), (mem)); \ +} while (0) + +/* Four opcode SSE defines */ + +#define 
emit_sse_reg_reg_op4_size(inst,dreg,reg,op1,op2,op3,op4,size) do { \ + x86_prefix((inst), (unsigned char)(op1)); \ + *(inst)++ = (unsigned char)(op2); \ + *(inst)++ = (unsigned char)(op3); \ + *(inst)++ = (unsigned char)(op4); \ + x86_reg_emit ((inst), (dreg), (reg)); \ +} while (0) + +#define emit_sse_reg_reg_op4(inst,dreg,reg,op1,op2,op3,op4) emit_sse_reg_reg_op4_size ((inst), (dreg), (reg), (op1), (op2), (op3), (op4), 0) + +#define emit_sse_reg_reg_op4_imm(inst,dreg,reg,op1,op2,op3,op4,imm) do { \ + emit_sse_reg_reg_op4 ((inst), (dreg), (reg), (op1), (op2), (op3), (op4)); \ + x86_imm_emit8 ((inst), (imm)); \ +} while (0) + +#endif + +/* specific SSE opcode defines */ + +#define x86_sse_xorpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst),(dreg),(reg), 0x66, 0x0f, 0x57) + +#define x86_sse_xorpd_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase ((inst),(dreg),(basereg), (disp), 0x66, 0x0f, 0x57) + +#define x86_sse_andpd_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase ((inst),(dreg),(basereg), (disp), 0x66, 0x0f, 0x54) + +#define x86_sse_movsd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf2, 0x0f, 0x10) +#define x86_sse_movss_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf3, 0x0f, 0x10) + +#define x86_sse_movsd_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase ((inst), (dreg), (basereg), (disp), 0xf2, 0x0f, 0x10) + +#define x86_sse_movsd_membase_reg(inst,basereg,disp,reg) emit_sse_membase_reg ((inst), (basereg), (disp), (reg), 0xf2, 0x0f, 0x11) + +#define x86_sse_movsd_reg_mem(inst,dreg,mem) emit_sse_reg_mem ((inst), (dreg), (mem), 0xf2, 0x0f, 0x10) + +#define x86_sse_movss_membase_reg(inst,basereg,disp,reg) emit_sse_membase_reg ((inst), (basereg), (disp), (reg), 0xf3, 0x0f, 0x11) + +#define x86_sse_movss_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase ((inst), (dreg), (basereg), (disp), 0xf3, 0x0f, 0x10) + +#define x86_sse_movss_reg_mem(inst,dreg,mem) emit_sse_reg_mem ((inst), (dreg), (mem), 
0xf3, 0x0f, 0x10) + +#define x86_sse_movq_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase ((inst), (dreg), (basereg), (disp), 0xf3, 0x0f, 0x7e) + +#define x86_sse_comisd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst),(dreg),(reg),0x66,0x0f,0x2f) +#define x86_sse_comiss_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst),(dreg),(reg),0x67,0x0f,0x2f) + +#define x86_sse_comisd_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase ((inst), (dreg), (basereg), (disp), 0x66, 0x0f, 0x2f) + +#define x86_sse_ucomisd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst),(dreg),(reg),0x66,0x0f,0x2e) + +#define x86_sse_cvtsd2si_reg_reg(inst,dreg,reg) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf2, 0x0f, 0x2d, 8) +#define x86_sse_cvtss2si_reg_reg(inst,dreg,reg) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf3, 0x0f, 0x2d, 8) + +#define x86_sse_cvttsd2si_reg_reg_size(inst,dreg,reg,size) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf2, 0x0f, 0x2c, (size)) +#define x86_sse_cvtss2si_reg_reg_size(inst,dreg,reg,size) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf3, 0x0f, 0x2c, (size)) + +#define x86_sse_cvttsd2si_reg_reg(inst,dreg,reg) x86_sse_cvttsd2si_reg_reg_size ((inst), (dreg), (reg), 8) + +#define x86_sse_cvtsi2sd_reg_reg_size(inst,dreg,reg,size) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf2, 0x0f, 0x2a, (size)) + +#define x86_sse_cvtsi2sd_reg_reg(inst,dreg,reg) x86_sse_cvtsi2sd_reg_reg_size ((inst), (dreg), (reg), 8) + +#define x86_sse_cvtsi2sd_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase ((inst), (dreg), (basereg), (disp), 0xf2, 0x0f, 0x2a) + +#define x86_sse_cvtsi2ss_reg_reg_size(inst,dreg,reg,size) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf3, 0x0f, 0x2a, (size)) + +#define x86_sse_cvtsi2ss_reg_reg(inst,dreg,reg) x86_sse_cvtsi2ss_reg_reg_size ((inst), (dreg), (reg), 8) + +#define x86_sse_cvtsd2ss_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf2, 0x0f, 0x5a) + +#define x86_sse_cvtss2sd_reg_reg(inst,dreg,reg) 
emit_sse_reg_reg ((inst), (dreg), (reg), 0xf3, 0x0f, 0x5a) + +#define x86_sse_addsd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf2, 0x0f, 0x58) +#define x86_sse_addss_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf3, 0x0f, 0x58) + +#define x86_sse_subsd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf2, 0x0f, 0x5c) +#define x86_sse_subss_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf3, 0x0f, 0x5c) + +#define x86_sse_mulsd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf2, 0x0f, 0x59) +#define x86_sse_mulss_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf3, 0x0f, 0x59) + +#define x86_sse_divsd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf2, 0x0f, 0x5e) +#define x86_sse_divss_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst), (dreg), (reg), 0xf3, 0x0f, 0x5e) + +#define x86_sse_sqrtsd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf2, 0x0f, 0x51) + + +#define x86_sse_pinsrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc4, (imm)) + +#define x86_sse_pextrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc5, (imm)) + + +#define x86_sse_cvttsd2si_reg_xreg_size(inst,reg,xreg,size) emit_sse_reg_reg_size ((inst), (reg), (xreg), 0xf2, 0x0f, 0x2c, (size)) + + +#define x86_sse_addps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x58) + +#define x86_sse_divps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x5e) + +#define x86_sse_mulps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x59) + +#define x86_sse_subps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x5c) + +#define x86_sse_maxps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x5f) + +#define x86_sse_minps_reg_reg(inst,dreg,reg) 
emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x5d) + +#define x86_sse_cmpps_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op2_imm((inst), (dreg), (reg), 0x0f, 0xc2, (imm)) + +#define x86_sse_andps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x54) + +#define x86_sse_andnps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x55) + +#define x86_sse_orps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x56) + +#define x86_sse_xorps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x57) + +#define x86_sse_sqrtps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x51) + +#define x86_sse_rsqrtps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x52) + +#define x86_sse_rcpps_reg_reg(inst,dreg,reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x53) + +#define x86_sse_addsubps_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf2, 0x0f, 0xd0) + +#define x86_sse_haddps_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf2, 0x0f, 0x7c) + +#define x86_sse_hsubps_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf2, 0x0f, 0x7d) + +#define x86_sse_movshdup_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf3, 0x0f, 0x16) + +#define x86_sse_movsldup_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf3, 0x0f, 0x12) + + +#define x86_sse_pshufhw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm((inst), (dreg), (reg), 0xf3, 0x0f, 0x70, (imm)) + +#define x86_sse_pshuflw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm((inst), (dreg), (reg), 0xf2, 0x0f, 0x70, (imm)) + +#define x86_sse_pshufd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm((inst), (dreg), (reg), 0x66, 0x0f, 0x70, (imm)) + +#define x86_sse_shufps_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op2_imm((inst), (dreg), (reg), 0x0f, 0xC6, (imm)) + +#define 
x86_sse_shufpd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm((inst), (dreg), (reg), 0x66, 0x0f, 0xC6, (imm)) + +#define x86_sse_roundpd_reg_reg_imm(inst, dreg, reg, imm) emit_sse_reg_reg_op4_imm((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x09, (imm)) + +#define x86_sse_addpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x58) + +#define x86_sse_divpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x5e) + +#define x86_sse_mulpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x59) + +#define x86_sse_subpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x5c) + +#define x86_sse_maxpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x5f) + +#define x86_sse_minpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x5d) + +#define x86_sse_cmppd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm((inst), (dreg), (reg), 0x66, 0x0f, 0xc2, (imm)) + +#define x86_sse_andpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x54) + +#define x86_sse_andnpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x55) + +#define x86_sse_orpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x56) + +#define x86_sse_sqrtpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x51) + +#define x86_sse_rsqrtpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x52) + +#define x86_sse_rcppd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x53) + +#define x86_sse_addsubpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd0) + +#define x86_sse_haddpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x7c) + +#define x86_sse_hsubpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x7d) + +#define 
x86_sse_movddup_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf2, 0x0f, 0x12) + + +#define x86_sse_pmovmskb_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd7) + + +#define x86_sse_pand_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xdb) + +#define x86_sse_pandn_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xdf) + +#define x86_sse_por_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xeb) + +#define x86_sse_pxor_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xef) + + +#define x86_sse_paddb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xfc) + +#define x86_sse_paddw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xfd) + +#define x86_sse_paddd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xfe) + +#define x86_sse_paddq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd4) + + +#define x86_sse_psubb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xf8) + +#define x86_sse_psubw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xf9) + +#define x86_sse_psubd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xfa) + +#define x86_sse_psubq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xfb) + + +#define x86_sse_pmaxub_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xde) + +#define x86_sse_pmaxuw_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x3e) + +#define x86_sse_pmaxud_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x3f) + + +#define x86_sse_pmaxsb_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x3c) + +#define 
x86_sse_pmaxsw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xee) + +#define x86_sse_pmaxsd_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x3d) + + +#define x86_sse_pavgb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe0) + +#define x86_sse_pavgw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe3) + + +#define x86_sse_pminub_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xda) + +#define x86_sse_pminuw_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x3a) + +#define x86_sse_pminud_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x3b) + + +#define x86_sse_pminsb_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x38) + +#define x86_sse_pminsw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xea) + +#define x86_sse_pminsd_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x39) + + +#define x86_sse_pcmpeqb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x74) + +#define x86_sse_pcmpeqw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x75) + +#define x86_sse_pcmpeqd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x76) + +#define x86_sse_pcmpeqq_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x29) + + +#define x86_sse_pcmpgtb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x64) + +#define x86_sse_pcmpgtw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x65) + +#define x86_sse_pcmpgtd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x66) + +#define x86_sse_pcmpgtq_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), 
(dreg), (reg), 0x66, 0x0f, 0x38, 0x37) + + +#define x86_sse_psadbw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xf6) + + +#define x86_sse_punpcklbw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x60) + +#define x86_sse_punpcklwd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x61) + +#define x86_sse_punpckldq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x62) + +#define x86_sse_punpcklqdq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x6c) + +#define x86_sse_unpcklpd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x14) + +#define x86_sse_unpcklps_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x14) + + +#define x86_sse_punpckhbw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x68) + +#define x86_sse_punpckhwd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x69) + +#define x86_sse_punpckhdq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x6a) + +#define x86_sse_punpckhqdq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x6d) + +#define x86_sse_unpckhpd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x15) + +#define x86_sse_unpckhps_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x15) + + +#define x86_sse_packsswb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x63) + +#define x86_sse_packssdw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x6b) + +#define x86_sse_packuswb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x67) + +#define x86_sse_packusdw_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x2b) + + +#define x86_sse_paddusb_reg_reg(inst, dreg, 
reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xdc) + +#define x86_sse_psubusb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd8) + +#define x86_sse_paddusw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xdd) + +#define x86_sse_psubusw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd9) + + +#define x86_sse_paddsb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xec) + +#define x86_sse_psubsb_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe8) + +#define x86_sse_paddsw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xed) + +#define x86_sse_psubsw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe9) + + +#define x86_sse_pmullw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd5) + +#define x86_sse_pmulld_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op4((inst), (dreg), (reg), 0x66, 0x0f, 0x38, 0x40) + +#define x86_sse_pmuludq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xf4) + +#define x86_sse_pmulhuw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe4) + +#define x86_sse_pmulhw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe5) + + +#define x86_sse_psrlw_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SHR, (reg), 0x66, 0x0f, 0x71, (imm)) + +#define x86_sse_psrlw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd1) + + +#define x86_sse_psraw_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SAR, (reg), 0x66, 0x0f, 0x71, (imm)) + +#define x86_sse_psraw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe1) + + +#define x86_sse_psllw_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SHL, (reg), 0x66, 0x0f, 0x71, (imm)) + +#define 
x86_sse_psllw_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xf1) + + +#define x86_sse_psrld_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SHR, (reg), 0x66, 0x0f, 0x72, (imm)) + +#define x86_sse_psrld_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd2) + + +#define x86_sse_psrad_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SAR, (reg), 0x66, 0x0f, 0x72, (imm)) + +#define x86_sse_psrad_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe2) + + +#define x86_sse_pslld_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SHL, (reg), 0x66, 0x0f, 0x72, (imm)) + +#define x86_sse_pslld_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xf2) + + +#define x86_sse_psrlq_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SHR, (reg), 0x66, 0x0f, 0x73, (imm)) + +#define x86_sse_psrlq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xd3) + + +#define x86_sse_psraq_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SAR, (reg), 0x66, 0x0f, 0x73, (imm)) + +#define x86_sse_psraq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xe3) + + +#define x86_sse_psllq_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SHL, (reg), 0x66, 0x0f, 0x73, (imm)) + +#define x86_sse_psllq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0xf3) + + +#define x86_sse_cvtdq2pd_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xF3, 0x0F, 0xE6) + +#define x86_sse_cvtdq2ps_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0F, 0x5B) + +#define x86_sse_cvtpd2dq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xF2, 0x0F, 0xE6) + +#define x86_sse_cvtpd2ps_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0F, 0x5A) + +#define x86_sse_cvtps2dq_reg_reg(inst, dreg, reg) 
emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0F, 0x5B) + +#define x86_sse_cvtps2pd_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0F, 0x5A) + +#define x86_sse_cvttpd2dq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0F, 0xE6) + +#define x86_sse_cvttps2dq_reg_reg(inst, dreg, reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xF3, 0x0F, 0x5B) + + +#define x86_movd_xreg_reg_size(inst,dreg,sreg,size) emit_sse_reg_reg_size((inst), (dreg), (sreg), 0x66, 0x0f, 0x6e, (size)) + +#define x86_movd_reg_xreg_size(inst,dreg,sreg,size) emit_sse_reg_reg_size((inst), (sreg), (dreg), 0x66, 0x0f, 0x7e, (size)) + +#define x86_movd_xreg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase((inst), (dreg), (basereg), (disp), 0x66, 0x0f, 0x6e) + + +#define x86_movlhps_reg_reg(inst,dreg,sreg) emit_sse_reg_reg_op2((inst), (dreg), (sreg), 0x0f, 0x16) + +#define x86_movhlps_reg_reg(inst,dreg,sreg) emit_sse_reg_reg_op2((inst), (dreg), (sreg), 0x0f, 0x12) + +#define x86_sse_movups_membase_reg(inst, basereg, disp, reg) emit_sse_membase_reg_op2((inst), (basereg), (disp), (reg), 0x0f, 0x11) + +#define x86_sse_movups_reg_membase(inst, dreg, basereg, disp) emit_sse_reg_membase_op2((inst), (dreg), (basereg), (disp), 0x0f, 0x10) + +#define x86_sse_movaps_membase_reg(inst, basereg, disp, reg) emit_sse_membase_reg_op2((inst), (basereg), (disp), (reg), 0x0f, 0x29) + +#define x86_sse_movaps_reg_membase(inst, dreg, basereg, disp) emit_sse_reg_membase_op2((inst), (dreg), (basereg), (disp), 0x0f, 0x28) + +#define x86_sse_movaps_reg_reg(inst, dreg, reg) emit_sse_reg_reg_op2((inst), (dreg), (reg), 0x0f, 0x28) + +#define x86_sse_movntps_reg_membase(inst, dreg, basereg, disp) emit_sse_reg_membase_op2((inst), (dreg), (basereg), (disp), 0x0f, 0x2b) + +#define x86_sse_prefetch_reg_membase(inst, arg, basereg, disp) emit_sse_reg_membase_op2((inst), (arg), (basereg), (disp), 0x0f, 0x18) + +#define x86_sse_lzcnt_reg_reg_size(inst, dreg, reg, size) 
emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xbd, (size)) +#define x86_sse_popcnt_reg_reg_size(inst, dreg, reg, size) emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xb8, (size)) + #endif // X86_H diff --git a/src/mono/mono/mini/aot-compiler.c b/src/mono/mono/mini/aot-compiler.c index 4b939c70fdb4e..db6f3435edd88 100644 --- a/src/mono/mono/mini/aot-compiler.c +++ b/src/mono/mono/mini/aot-compiler.c @@ -12638,7 +12638,8 @@ compile_asm (MonoAotCompile *acfg) #define LD_NAME "clang" #define LD_OPTIONS "-m32 -dynamiclib" #elif defined(TARGET_X86) && !defined(TARGET_MACH) -#define LD_OPTIONS "-m elf_i386 -Bsymbolic" +#define LD_NAME "ld" +#define LD_OPTIONS "--shared -m elf_i386" #elif defined(TARGET_ARM) && !defined(TARGET_ANDROID) #define LD_NAME "gcc" #define LD_OPTIONS "--shared -Wl,-Bsymbolic" diff --git a/src/mono/mono/mini/cpu-x86.mdesc b/src/mono/mono/mini/cpu-x86.mdesc index 96aa1fbc8b0e1..e68d613872433 100644 --- a/src/mono/mono/mini/cpu-x86.mdesc +++ b/src/mono/mono/mini/cpu-x86.mdesc @@ -143,7 +143,7 @@ endfinally: len:16 endfilter: src1:a len:16 get_ex_obj: dest:a len:16 -ckfinite: dest:f src1:f len:32 +ckfinite: dest:f src1:f len:40 ceq: dest:y len:6 cgt: dest:y len:6 cgt_un: dest:y len:6 @@ -153,14 +153,18 @@ localloc: dest:i src1:i len:120 compare: src1:i src2:i len:2 compare_imm: src1:i len:6 fcompare: src1:f src2:f clob:a len:9 +rcompare: src1:f src2:f clob:a len:13 arglist: src1:b len:10 check_this: src1:b len:3 voidcall: len:17 clob:c voidcall_reg: src1:i len:11 clob:c voidcall_membase: src1:b len:16 clob:c -fcall: dest:f len:17 clob:c -fcall_reg: dest:f src1:i len:11 clob:c -fcall_membase: dest:f src1:b len:16 clob:c +fcall: dest:f len:28 clob:c +fcall_reg: dest:f src1:i len:28 clob:c +fcall_membase: dest:f src1:b len:28 clob:c +rcall: dest:f len:28 clob:c +rcall_reg: dest:f src1:i len:28 clob:c +rcall_membase: dest:f src1:b len:28 clob:c lcall: dest:l len:17 clob:c lcall_reg: dest:l src1:i len:11 clob:c lcall_membase: dest:l 
src1:b len:16 clob:c @@ -170,8 +174,8 @@ vcall_membase: src1:b len:16 clob:c call_reg: dest:a src1:i len:11 clob:c call_membase: dest:a src1:b len:16 clob:c iconst: dest:i len:5 -r4const: dest:f len:15 -r8const: dest:f len:16 +r4const: dest:f len:24 +r8const: dest:f len:24 store_membase_imm: dest:b len:11 store_membase_reg: dest:b src1:i len:7 storei1_membase_imm: dest:b len:10 @@ -182,8 +186,8 @@ storei4_membase_imm: dest:b len:10 storei4_membase_reg: dest:b src1:i len:7 storei8_membase_imm: dest:b storei8_membase_reg: dest:b src1:i -storer4_membase_reg: dest:b src1:f len:7 -storer8_membase_reg: dest:b src1:f len:7 +storer4_membase_reg: dest:b src1:f len:9 +storer8_membase_reg: dest:b src1:f len:9 load_membase: dest:i src1:b len:7 loadi1_membase: dest:y src1:b len:7 loadu1_membase: dest:y src1:b len:7 @@ -192,8 +196,8 @@ loadu2_membase: dest:i src1:b len:7 loadi4_membase: dest:i src1:b len:7 loadu4_membase: dest:i src1:b len:7 loadi8_membase: dest:i src1:b -loadr4_membase: dest:f src1:b len:7 -loadr8_membase: dest:f src1:b len:7 +loadr4_membase: dest:f src1:b len:9 +loadr8_membase: dest:f src1:b len:9 loadu4_mem: dest:i len:9 move: dest:i src1:i len:2 addcc_imm: dest:i src1:i len:6 clob:1 @@ -237,25 +241,26 @@ float_bge: len:22 float_bge_un: len:12 float_ble: len:22 float_ble_un: len:12 -float_add: dest:f src1:f src2:f len:2 -float_sub: dest:f src1:f src2:f len:2 -float_mul: dest:f src1:f src2:f len:2 -float_div: dest:f src1:f src2:f len:2 -float_div_un: dest:f src1:f src2:f len:2 +float_add: dest:f src1:f src2:f len:8 +float_sub: dest:f src1:f src2:f len:8 +float_mul: dest:f src1:f src2:f len:8 +float_div: dest:f src1:f src2:f len:8 +float_div_un: dest:f src1:f src2:f len:8 float_rem: dest:f src1:f src2:f len:17 float_rem_un: dest:f src1:f src2:f len:17 -float_neg: dest:f src1:f len:2 +float_neg: dest:f src1:f len:24 float_not: dest:f src1:f len:2 float_conv_to_i1: dest:y src1:f len:39 float_conv_to_i2: dest:y src1:f len:39 float_conv_to_i4: dest:i src1:f len:39 
-float_conv_to_i8: dest:L src1:f len:39 +float_conv_to_i8: dest:L src1:f len:50 float_conv_to_u4: dest:i src1:f len:39 float_conv_to_u8: dest:L src1:f len:39 float_conv_to_u2: dest:y src1:f len:39 float_conv_to_u1: dest:y src1:f len:39 float_conv_to_ovf_i: dest:a src1:f len:30 float_conv_to_ovd_u: dest:a src1:f len:30 +float_conv_to_r4: dest:f src1:f len:17 float_mul_ovf: float_ceq: dest:y src1:f src2:f len:25 float_cgt: dest:y src1:f src2:f len:25 @@ -312,7 +317,7 @@ sbb_imm: dest:i src1:i len:6 clob:1 br_reg: src1:i len:2 sin: dest:f src1:f len:6 cos: dest:f src1:f len:6 -abs: dest:f src1:f len:2 +abs: dest:f src1:f clob:1 len:16 tan: dest:f src1:f len:49 atan: dest:f src1:f len:8 sqrt: dest:f src1:f len:2 @@ -423,11 +428,12 @@ cmov_ile_un: dest:i src1:i src2:i len:16 clob:1 cmov_ilt_un: dest:i src1:i src2:i len:16 clob:1 long_conv_to_ovf_i4_2: dest:i src1:i src2:i len:30 -long_conv_to_r8_2: dest:f src1:i src2:i len:14 -long_conv_to_r4_2: dest:f src1:i src2:i len:14 +long_conv_to_r8_2: dest:f src1:i src2:i len:24 +long_conv_to_r4_2: dest:f src1:i src2:i len:24 long_conv_to_r_un_2: dest:f src1:i src2:i len:40 -fmove: dest:f src1:f +fmove: dest:f src1:f len:4 +rmove: dest:f src1:f len:4 move_f_to_i4: dest:i src1:f len:17 move_i4_to_f: dest:f src1:i len:17 float_conv_to_r4: dest:f src1:f len:12 @@ -671,3 +677,32 @@ set_sp: src1:i len:6 fill_prof_call_ctx: src1:i len:128 get_last_error: dest:i len:32 + +x86_move_r8_to_fpstack: src1:f len:16 +x86_move_r4_to_fpstack: src1:f len:16 +iconv_to_r4_raw: dest:f src1:i len:10 + +# R4 opcodes +r4_conv_to_i1: dest:y src1:f len:32 +r4_conv_to_u1: dest:y src1:f len:32 +r4_conv_to_i2: dest:y src1:f len:32 +r4_conv_to_u2: dest:y src1:f len:32 +r4_conv_to_i4: dest:i src1:f len:16 +r4_conv_to_u4: dest:i src1:f len:32 +r4_conv_to_i8: dest:L src1:f len:64 +r4_conv_to_i: dest:i src1:f len:32 +r4_conv_to_r8: dest:f src1:f len:17 +r4_conv_to_r4: dest:f src1:f len:17 +r4_add: dest:f src1:f src2:f clob:1 len:5 +r4_sub: dest:f src1:f src2:f 
clob:1 len:5 +r4_mul: dest:f src1:f src2:f clob:1 len:5 +r4_div: dest:f src1:f src2:f clob:1 len:5 +r4_neg: dest:f src1:f clob:1 len:23 +r4_ceq: dest:y src1:f src2:f len:35 +r4_cgt: dest:y src1:f src2:f len:35 +r4_cgt_un: dest:y src1:f src2:f len:48 +r4_clt: dest:y src1:f src2:f len:35 +r4_clt_un: dest:y src1:f src2:f len:42 +r4_cneq: dest:y src1:f src2:f len:42 +r4_cge: dest:y src1:f src2:f len:35 +r4_cle: dest:y src1:f src2:f len:35 diff --git a/src/mono/mono/mini/local-propagation.c b/src/mono/mono/mini/local-propagation.c index 39174d72b74eb..7d156c1f64569 100644 --- a/src/mono/mono/mini/local-propagation.c +++ b/src/mono/mono/mini/local-propagation.c @@ -623,7 +623,6 @@ mono_local_cprop (MonoCompile *cfg) /* This avoids propagating local vregs across calls */ ((get_vreg_to_inst (cfg, def->sreg1) || !defs [def->sreg1] || (def_index [def->sreg1] >= last_call_index) || (def->opcode == OP_VMOVE))) && !(defs [def->sreg1] && mono_inst_next (defs [def->sreg1], filter) == def) && - (!MONO_ARCH_USE_FPSTACK || (def->opcode != OP_FMOVE)) && (def->opcode != OP_FMOVE)) { int vreg = def->sreg1; @@ -640,7 +639,7 @@ mono_local_cprop (MonoCompile *cfg) /* is_inst_imm is only needed for binops */ if ((((def->opcode == OP_ICONST) || ((sizeof (gpointer) == 8) && (def->opcode == OP_I8CONST)) || (def->opcode == OP_PCONST))) || - (!MONO_ARCH_USE_FPSTACK && (def->opcode == OP_R8CONST))) { + (def->opcode == OP_R8CONST)) { guint32 opcode2; /* srcindex == 1 -> binop, ins->sreg2 == -1 -> unop */ @@ -815,17 +814,6 @@ mono_local_cprop (MonoCompile *cfg) } } -static gboolean -reg_is_softreg_no_fpstack (int reg, const char spec) -{ - return (spec == 'i' && reg >= MONO_MAX_IREGS) - || ((spec == 'f' && reg >= MONO_MAX_FREGS) && !MONO_ARCH_USE_FPSTACK) -#ifdef MONO_ARCH_SIMD_INTRINSICS - || (spec == 'x' && reg >= MONO_MAX_XREGS) -#endif - || (spec == 'v'); -} - static gboolean reg_is_softreg (int reg, const char spec) { @@ -953,8 +941,7 @@ mono_local_deadce (MonoCompile *cfg) } } - /* Enabling 
this on x86 could screw up the fp stack */ - if (reg_is_softreg_no_fpstack (ins->dreg, spec [MONO_INST_DEST])) { + if (reg_is_softreg (ins->dreg, spec [MONO_INST_DEST])) { /* * Assignments to global vregs can only be eliminated if there is another * assignment to the same vreg later in the same bblock. diff --git a/src/mono/mono/mini/method-to-ir.c b/src/mono/mono/mini/method-to-ir.c index 79378cd026263..cfb8f7cc1ff88 100644 --- a/src/mono/mono/mini/method-to-ir.c +++ b/src/mono/mono/mini/method-to-ir.c @@ -7181,12 +7181,6 @@ mono_method_to_ir (MonoCompile *cfg, MonoMethod *method, MonoBasicBlock *start_b } case MONO_CEE_POP: --sp; - -#ifdef TARGET_X86 - if (sp [0]->type == STACK_R8) - /* we need to pop the value from the x86 FP stack */ - MONO_EMIT_NEW_UNALU (cfg, OP_X86_FPOP, -1, sp [0]->dreg); -#endif break; case MONO_CEE_JMP: { MonoCallInst *call; @@ -13057,7 +13051,7 @@ mono_spill_global_vars (MonoCompile *cfg, gboolean *need_local_opts) * sregs could use it. So set a flag, and do it after * the sregs. 
*/ - if ((!cfg->backend->use_fpstack || ((store_opcode != OP_STORER8_MEMBASE_REG) && (store_opcode != OP_STORER4_MEMBASE_REG))) && !((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT))) + if (!((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT))) dest_has_lvreg = TRUE; } } @@ -13147,7 +13141,7 @@ mono_spill_global_vars (MonoCompile *cfg, gboolean *need_local_opts) sreg = alloc_dreg (cfg, stacktypes [regtype]); - if ((!cfg->backend->use_fpstack || ((load_opcode != OP_LOADR8_MEMBASE) && (load_opcode != OP_LOADR4_MEMBASE))) && !((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && !no_lvreg) { + if (!((var)->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT)) && !no_lvreg) { if (var->dreg == prev_dreg) { /* * sreg refers to the value loaded by the load diff --git a/src/mono/mono/mini/mini-amd64.h b/src/mono/mono/mini/mini-amd64.h index e156944574831..80c570e58f5be 100644 --- a/src/mono/mono/mini/mini-amd64.h +++ b/src/mono/mono/mini/mini-amd64.h @@ -126,8 +126,6 @@ struct sigcontext { #define MONO_ARCH_USE_SHARED_FP_SIMD_BANK 1 #endif - - #if defined(__APPLE__) #define MONO_ARCH_SIGNAL_STACK_SIZE MINSIGSTKSZ #else @@ -164,8 +162,6 @@ struct sigcontext { #define MONO_ARCH_CALLEE_REGS AMD64_CALLEE_REGS #define MONO_ARCH_CALLEE_SAVED_REGS AMD64_CALLEE_SAVED_REGS -#define MONO_ARCH_USE_FPSTACK FALSE - #define MONO_ARCH_INST_FIXED_REG(desc) ((desc == '\0') ? -1 : ((desc == 'i' ? -1 : ((desc == 'a') ? AMD64_RAX : ((desc == 's') ? AMD64_RCX : ((desc == 'd') ? AMD64_RDX : ((desc == 'A') ? 
MONO_AMD64_ARG_REG1 : -1))))))) /* RDX is clobbered by the opcode implementation before accessing sreg2 */ diff --git a/src/mono/mono/mini/mini-arm.h b/src/mono/mono/mini/mini-arm.h index 12581b13992b3..73bcc8bb8bb4b 100644 --- a/src/mono/mono/mini/mini-arm.h +++ b/src/mono/mono/mini/mini-arm.h @@ -92,8 +92,6 @@ #define MONO_ARCH_CALLEE_SAVED_FREGS 0x00000000 #endif -#define MONO_ARCH_USE_FPSTACK FALSE - #define MONO_ARCH_INST_SREG2_MASK(ins) (0) #define MONO_ARCH_INST_FIXED_REG(desc) \ diff --git a/src/mono/mono/mini/mini-arm64.h b/src/mono/mono/mini/mini-arm64.h index 8feacf8f81e49..33c3a29466056 100644 --- a/src/mono/mono/mini/mini-arm64.h +++ b/src/mono/mono/mini/mini-arm64.h @@ -56,8 +56,6 @@ #define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS -#define MONO_ARCH_USE_FPSTACK FALSE - #define MONO_ARCH_INST_SREG2_MASK(ins) (0) #define MONO_ARCH_INST_FIXED_REG(desc) ((desc) == 'a' ? ARMREG_R0 : -1) @@ -68,8 +66,6 @@ #define MONO_ARCH_INST_REGPAIR_REG2(desc,hreg1) (-1) -#define MONO_ARCH_USE_FPSTACK FALSE - #define MONO_ARCH_FRAME_ALIGNMENT 16 #define MONO_ARCH_CODE_ALIGNMENT 32 diff --git a/src/mono/mono/mini/mini-codegen.c b/src/mono/mono/mini/mini-codegen.c index da91d256d9231..b391f97c6ca69 100644 --- a/src/mono/mono/mini/mini-codegen.c +++ b/src/mono/mono/mini/mini-codegen.c @@ -1135,11 +1135,6 @@ mono_local_regalloc (MonoCompile *cfg, MonoBasicBlock *bb) const char *spec; unsigned char spec_src1, spec_dest; int bank = 0; -#if MONO_ARCH_USE_FPSTACK - gboolean has_fp = FALSE; - int fpstack [8]; - int sp = 0; -#endif int num_sregs = 0; int sregs [MONO_MAX_SRC_REGS]; @@ -1258,17 +1253,6 @@ mono_local_regalloc (MonoCompile *cfg, MonoBasicBlock *bb) num_sregs = mono_inst_get_src_registers (ins, sregs); -#if MONO_ARCH_USE_FPSTACK - if (dreg_is_fp (spec)) { - has_fp = TRUE; - } else { - for (j = 0; j < num_sregs; ++j) { - if (sreg_is_fp (j, spec)) - has_fp = TRUE; - } - } -#endif - for (j = 0; j < num_sregs; ++j) { int sreg = sregs [j]; int sreg_spec = spec 
[MONO_INST_SRC1 + j]; @@ -2180,158 +2164,6 @@ mono_local_regalloc (MonoCompile *cfg, MonoBasicBlock *bb) DEBUG (mono_print_ins_index (i, ins)); } - - // FIXME: Set MAX_FREGS to 8 - // FIXME: Optimize generated code -#if MONO_ARCH_USE_FPSTACK - /* - * Make a forward pass over the code, simulating the fp stack, making sure the - * arguments required by the fp opcodes are at the top of the stack. - */ - if (has_fp) { - MonoInst *fxch; - int fpstack_tmp; - - g_assert (num_sregs <= 2); - - prev = NULL; - for (ins = bb->code; ins; ins = ins->next) { - spec = ins_get_spec (ins->opcode); - - DEBUG (printf ("processing:")); - DEBUG (mono_print_ins_index (0, ins)); - - if (ins->opcode == OP_FMOVE) { - /* Do it by renaming the source to the destination on the stack */ - // FIXME: Is this correct ? - for (i = 0; i < sp; ++i) - if (fpstack [i] == ins->sreg1) - fpstack [i] = ins->dreg; - prev = ins; - continue; - } - - if (sreg1_is_fp (spec) && sreg2_is_fp (spec) && (fpstack [sp - 2] != ins->sreg1)) { - /* Arg1 must be in %st(1) */ - g_assert (prev); - - i = 0; - while ((i < sp) && (fpstack [i] != ins->sreg1)) - i ++; - g_assert (i < sp); - - if (sp - 1 - i > 0) { - /* First move it to %st(0) */ - DEBUG (printf ("\tswap %%st(0) and %%st(%d)\n", sp - 1 - i)); - - MONO_INST_NEW (cfg, fxch, OP_X86_FXCH); - fxch->inst_imm = sp - 1 - i; - - mono_bblock_insert_after_ins (bb, prev, fxch); - prev = fxch; - - fpstack_tmp = fpstack [sp - 1]; - fpstack [sp - 1] = fpstack [i]; - fpstack [i] = fpstack_tmp; - } - - /* Then move it to %st(1) */ - DEBUG (printf ("\tswap %%st(0) and %%st(1)\n")); - - MONO_INST_NEW (cfg, fxch, OP_X86_FXCH); - fxch->inst_imm = 1; - - mono_bblock_insert_after_ins (bb, prev, fxch); - prev = fxch; - - fpstack_tmp = fpstack [sp - 1]; - fpstack [sp - 1] = fpstack [sp - 2]; - fpstack [sp - 2] = fpstack_tmp; - } - - if (sreg2_is_fp (spec)) { - g_assert (sp > 0); - - if (fpstack [sp - 1] != ins->sreg2) { - g_assert (prev); - - i = 0; - while ((i < sp) && (fpstack [i] != 
ins->sreg2)) - i ++; - g_assert (i < sp); - - DEBUG (printf ("\tswap %%st(0) and %%st(%d)\n", sp - 1 - i)); - - MONO_INST_NEW (cfg, fxch, OP_X86_FXCH); - fxch->inst_imm = sp - 1 - i; - - mono_bblock_insert_after_ins (bb, prev, fxch); - prev = fxch; - - fpstack_tmp = fpstack [sp - 1]; - fpstack [sp - 1] = fpstack [i]; - fpstack [i] = fpstack_tmp; - } - - sp --; - } - - if (sreg1_is_fp (spec)) { - g_assert (sp > 0); - - if (fpstack [sp - 1] != ins->sreg1) { - g_assert (prev); - - i = 0; - while ((i < sp) && (fpstack [i] != ins->sreg1)) - i ++; - g_assert (i < sp); - - DEBUG (printf ("\tswap %%st(0) and %%st(%d)\n", sp - 1 - i)); - - MONO_INST_NEW (cfg, fxch, OP_X86_FXCH); - fxch->inst_imm = sp - 1 - i; - - mono_bblock_insert_after_ins (bb, prev, fxch); - prev = fxch; - - fpstack_tmp = fpstack [sp - 1]; - fpstack [sp - 1] = fpstack [i]; - fpstack [i] = fpstack_tmp; - } - - sp --; - } - - if (dreg_is_fp (spec)) { - g_assert (sp < 8); - fpstack [sp ++] = ins->dreg; - } - - if (G_UNLIKELY (cfg->verbose_level >= 2)) { - printf ("\t["); - for (i = 0; i < sp; ++i) - printf ("%s%%fr%d", (i > 0) ? ", " : "", fpstack [i]); - printf ("]\n"); - } - - prev = ins; - } - - if (sp && bb != cfg->bb_exit && !(bb->out_count == 1 && bb->out_bb [0] == cfg->bb_exit)) { - /* Remove remaining items from the fp stack */ - /* - * These can remain for example as a result of a dead fmove like in - * System.Collections.Generic.EqualityComparer.Equals (). 
- */ - while (sp) { - MONO_INST_NEW (cfg, ins, OP_X86_FPOP); - mono_add_ins_to_end (bb, ins); - sp --; - } - } - } -#endif } MONO_RESTORE_WARNING diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index 5981917a49be7..c53e081ff07bc 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1343,6 +1343,8 @@ MINI_OP(OP_X86_BSF32, "x86_bsf32", IREG, IREG, NONE) MINI_OP(OP_X86_BSR32, "x86_bsr32", IREG, IREG, NONE) MINI_OP(OP_X86_BSF64, "x86_bsf64", LREG, LREG, NONE) MINI_OP(OP_X86_BSR64, "x86_bsr64", LREG, LREG, NONE) +MINI_OP(OP_X86_MOVE_R8_TO_FPSTACK, "x86_move_r8_to_fpstack", NONE, FREG, NONE) +MINI_OP(OP_X86_MOVE_R4_TO_FPSTACK, "x86_move_r4_to_fpstack", NONE, FREG, NONE) #endif #if defined(TARGET_AMD64) diff --git a/src/mono/mono/mini/mini-ppc.h b/src/mono/mono/mini/mini-ppc.h index 60931e2827b73..281eee97b5bb1 100644 --- a/src/mono/mono/mini/mini-ppc.h +++ b/src/mono/mono/mini/mini-ppc.h @@ -118,12 +118,11 @@ typedef struct MonoCompileArch { #else #define MONO_ARCH_CALLEE_FREGS (0xff << ppc_f1) #endif -#define MONO_ARCH_CALLEE_SAVED_FREGS (~(MONO_ARCH_CALLEE_FREGS | 1)) - -#define MONO_ARCH_USE_FPSTACK FALSE +#define MONO_ARCH_CALLEE_SAVED_FREGS (~(MONO_ARCH_CALLEE_FRE +GS | 1)) #ifdef TARGET_POWERPC64 -#define MONO_ARCH_INST_FIXED_REG(desc) (((desc) == 'a')? ppc_r3:\ +#define MONO_ARCH_INST_FIXED_REG(desc) (((desc) == 'a')? ppc_r3: \ ((desc) == 'g'? 
ppc_f1:-1)) #define MONO_ARCH_INST_IS_REGPAIR(desc) FALSE #define MONO_ARCH_INST_REGPAIR_REG2(desc,hreg1) (-1) diff --git a/src/mono/mono/mini/mini-profiler.c b/src/mono/mono/mini/mini-profiler.c index 9f28d9c9e8f03..74f997f5bf5de 100644 --- a/src/mono/mono/mini/mini-profiler.c +++ b/src/mono/mono/mini/mini-profiler.c @@ -59,13 +59,6 @@ emit_fill_call_ctx (MonoCompile *cfg, MonoInst *method, MonoInst *ret) EMIT_NEW_TEMPSTORE (cfg, store, var->inst_c0, ret); EMIT_NEW_VARLOADA (cfg, addr, var, NULL); MONO_EMIT_NEW_STORE_MEMBASE (cfg, OP_STORE_MEMBASE_REG, alloc->dreg, MONO_STRUCT_OFFSET (MonoProfilerCallContext, return_value), addr->dreg); - - /* Work around a limitation of the register allocator regarding - * FP stack, see https://github.com/mono/mono/pull/17251 */ - if (cfg->backend->use_fpstack && (ret_type->type == MONO_TYPE_R8 || ret_type->type == MONO_TYPE_R4)) { - MonoInst *move_ret_back; - EMIT_NEW_VARSTORE (cfg, move_ret_back, ret, ret_type, var); - } } return alloc; diff --git a/src/mono/mono/mini/mini-riscv.h b/src/mono/mono/mini/mini-riscv.h index ba078ad93a6b1..663ee469c2fe3 100644 --- a/src/mono/mono/mini/mini-riscv.h +++ b/src/mono/mono/mini/mini-riscv.h @@ -100,8 +100,6 @@ #define MONO_ARCH_HAVE_VOLATILE_NON_PARAM_REGISTER 0 -#define MONO_ARCH_USE_FPSTACK (FALSE) - #define MONO_ARCH_FRAME_ALIGNMENT (16) #define MONO_ARCH_CODE_ALIGNMENT (32) diff --git a/src/mono/mono/mini/mini-s390x.h b/src/mono/mono/mini/mini-s390x.h index e251b8c23b789..b9deb1a2683d6 100644 --- a/src/mono/mono/mini/mini-s390x.h +++ b/src/mono/mono/mini/mini-s390x.h @@ -129,8 +129,6 @@ struct SeqPointInfo { #define MONO_ARCH_CALLEE_SAVED_FREGS 0 -#define MONO_ARCH_USE_FPSTACK FALSE - #define MONO_ARCH_INST_FIXED_REG(desc) ((desc == 'o') ? s390_r2 : \ ((desc == 'g') ? s390_f0 : \ ((desc == 'A') ? 
S390_FIRST_ARG_REG : -1))) diff --git a/src/mono/mono/mini/mini-wasm.h b/src/mono/mono/mini/mini-wasm.h index e83a1baefb193..61e81e8a81fbd 100644 --- a/src/mono/mono/mini/mini-wasm.h +++ b/src/mono/mono/mini/mini-wasm.h @@ -12,8 +12,6 @@ #define WASM_REG_0 0 -#define MONO_ARCH_USE_FPSTACK FALSE - // Does the ABI have a volatile non-parameter register, so tailcall // can pass context to generics or interfaces? #define MONO_ARCH_HAVE_VOLATILE_NON_PARAM_REGISTER 0 @@ -78,8 +76,6 @@ typedef struct { /* must be at a power of 2 and >= 8 */ #define MONO_ARCH_FRAME_ALIGNMENT 16 -#define MONO_ARCH_USE_FPSTACK FALSE - // Does the ABI have a volatile non-parameter register, so tailcall // can pass context to generics or interfaces? #define MONO_ARCH_HAVE_VOLATILE_NON_PARAM_REGISTER 0 diff --git a/src/mono/mono/mini/mini-x86.c b/src/mono/mono/mini/mini-x86.c index 2cd543f237007..6680b08f72d28 100644 --- a/src/mono/mono/mini/mini-x86.c +++ b/src/mono/mono/mini/mini-x86.c @@ -210,8 +210,8 @@ add_float (guint32 *gr, guint32 *stack_size, ArgInfo *ainfo, gboolean is_double) static void add_valuetype (MonoMethodSignature *sig, ArgInfo *ainfo, MonoType *type, - gboolean is_return, - guint32 *gr, const guint32 *param_regs, guint32 *fr, guint32 *stack_size) + gboolean is_return, + guint32 *gr, const guint32 *param_regs, guint32 *fr, guint32 *stack_size) { guint32 size; MonoClass *klass; @@ -868,16 +868,11 @@ mono_arch_cpu_optimizations (guint32 *exclude_mask) *exclude_mask |= MONO_OPT_CMOV; } - if (mono_hwcap_x86_has_sse2) - opts |= MONO_OPT_SSE2; - else - *exclude_mask |= MONO_OPT_SSE2; + /* The fp code requires SSE2 */ + g_assertf (mono_hwcap_x86_has_sse2, "SSE2 is required."); + g_assertf (mono_hwcap_x86_has_fcmov, "FCMOV is required."); -#ifdef MONO_ARCH_SIMD_INTRINSICS - /*SIMD intrinsics require at least SSE2.*/ - if (!mono_hwcap_x86_has_sse2) - *exclude_mask |= MONO_OPT_SIMD; -#endif + opts |= MONO_OPT_SSE2; return opts; } @@ -1482,6 +1477,25 @@ emit_gc_param_slot_def 
(MonoCompile *cfg, int sp_offset, MonoType *t) } } +static MonoInst* +alloc_double_spill_var (MonoCompile *cfg) +{ + if (!cfg->fconv_to_r8_x_var) { + cfg->fconv_to_r8_x_var = mono_compile_create_var (cfg, m_class_get_byval_arg (mono_defaults.double_class), OP_LOCAL); + cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/ + } + return cfg->fconv_to_r8_x_var; +} + +static MonoInst* +get_double_spill_var (MonoCompile *cfg) +{ + MonoInst *var = cfg->fconv_to_r8_x_var; + g_assert (var); + g_assert (var->opcode == OP_REGOFFSET); + return var; +} + void mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call) { @@ -1683,6 +1697,20 @@ mono_arch_emit_call (MonoCompile *cfg, MonoCallInst *call) call->stack_usage = cinfo->stack_usage; call->stack_align_amount = cinfo->stack_align_amount; + + switch (call->inst.opcode) { + case OP_FCALL: + case OP_FCALL_REG: + case OP_FCALL_MEMBASE: + case OP_RCALL: + case OP_RCALL_REG: + case OP_RCALL_MEMBASE: + /* Needed to move the return value from the fp stack to a sse reg */ + alloc_double_spill_var (cfg); + break; + default: + break; + } } void @@ -1739,14 +1767,20 @@ mono_arch_emit_setret (MonoCompile *cfg, MonoMethod *method, MonoInst *val) if (!m_type_is_byref (ret)) { if (ret->type == MONO_TYPE_R4) { - if (COMPILE_LLVM (cfg)) + if (COMPILE_LLVM (cfg)) { MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg); - /* Nothing to do */ + } else { + alloc_double_spill_var (cfg); + MONO_EMIT_NEW_UNALU (cfg, OP_X86_MOVE_R4_TO_FPSTACK, -1, val->dreg); + } return; } else if (ret->type == MONO_TYPE_R8) { - if (COMPILE_LLVM (cfg)) + if (COMPILE_LLVM (cfg)) { MONO_EMIT_NEW_UNALU (cfg, OP_FMOVE, cfg->ret->dreg, val->dreg); - /* Nothing to do */ + } else { + alloc_double_spill_var (cfg); + MONO_EMIT_NEW_UNALU (cfg, OP_X86_MOVE_R8_TO_FPSTACK, -1, val->dreg); + } return; } else if (ret->type == MONO_TYPE_I8 || ret->type == MONO_TYPE_U8) { if (COMPILE_LLVM (cfg)) @@ -1832,6 +1866,49 @@ emit_call 
(MonoCompile *cfg, guint8 *code, guint32 patch_type, gconstpointer dat return code; } +static guint8* +emit_r8const (MonoCompile *cfg, guint8* code, int dreg, double *p) +{ + double d = *p; + + if ((d == 0.0) && (mono_signbit (d) == 0)) { + x86_sse_xorpd_reg_reg (code, dreg, dreg); + } else { + if (cfg->compile_aot) { + guint32 *val = (guint32*)&d; + x86_push_imm (code, val [1]); + x86_push_imm (code, val [0]); + x86_sse_movsd_reg_membase (code, dreg, X86_ESP, 0); + x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8); + } else { + mono_add_patch_info (cfg, code - cfg->native_code + X86_SSE_REG_MEM_OFFSET, MONO_PATCH_INFO_R8, p); + x86_sse_movsd_reg_mem (code, dreg, (gsize)NULL); + } + } + return code; +} + +static guint8* +emit_r4const (MonoCompile *cfg, guint8* code, int dreg, float *p) +{ + float f = *p; + + if ((f == 0.0) && (mono_signbit (f) == 0)) { + x86_sse_xorps_reg_reg (code, dreg, dreg); + } else { + if (cfg->compile_aot) { + guint32 val = *(guint32*)p; + x86_push_imm (code, val); + x86_sse_movss_reg_membase (code, dreg, X86_ESP, 0); + x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4); + } else { + mono_add_patch_info (cfg, code - cfg->native_code + X86_SSE_REG_MEM_OFFSET, MONO_PATCH_INFO_R4, p); + x86_sse_movss_reg_mem (code, dreg, (gsize)NULL); + } + } + return code; +} + #define INST_IGNORES_CFLAGS(opcode) (!(((opcode) == OP_ADC) || ((opcode) == OP_IADC) || ((opcode) == OP_ADC_IMM) || ((opcode) == OP_IADC_IMM) || ((opcode) == OP_SBB) || ((opcode) == OP_ISBB) || ((opcode) == OP_SBB_IMM) || ((opcode) == OP_ISBB_IMM))) /* @@ -2059,45 +2136,13 @@ cc_signed_table [] = { }; static unsigned char* -emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int size, gboolean is_signed) +emit_float_to_int (MonoCompile *cfg, guchar *code, int dreg, int sreg, int size, gboolean is_signed) { -#define XMM_TEMP_REG 0 - /*This SSE2 optimization must not be done which OPT_SIMD in place as it clobbers xmm0.*/ - /*The xmm pass decomposes OP_FCONV_ ops anyway anyway.*/ - if 
(cfg->opt & MONO_OPT_SSE2 && size < 8 && !(cfg->opt & MONO_OPT_SIMD)) { - /* optimize by assigning a local var for this use so we avoid - * the stack manipulations */ - x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8); - x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE); - x86_movsd_reg_membase (code, XMM_TEMP_REG, X86_ESP, 0); - x86_cvttsd2si (code, dreg, XMM_TEMP_REG); - x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8); - if (size == 1) - x86_widen_reg (code, dreg, dreg, is_signed, FALSE); - else if (size == 2) - x86_widen_reg (code, dreg, dreg, is_signed, TRUE); - return code; - } - x86_alu_reg_imm (code, X86_SUB, X86_ESP, 4); - x86_fnstcw_membase(code, X86_ESP, 0); - x86_mov_reg_membase (code, dreg, X86_ESP, 0, 2); - x86_alu_reg_imm (code, X86_OR, dreg, 0xc00); - x86_mov_membase_reg (code, X86_ESP, 2, dreg, 2); - x86_fldcw_membase (code, X86_ESP, 2); - if (size == 8) { - x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8); - x86_fist_pop_membase (code, X86_ESP, 0, TRUE); - x86_pop_reg (code, dreg); - /* FIXME: need the high register - * x86_pop_reg (code, dreg_high); - */ - } else { - x86_push_reg (code, X86_EAX); // SP = SP - 4 - x86_fist_pop_membase (code, X86_ESP, 0, FALSE); - x86_pop_reg (code, dreg); - } - x86_fldcw_membase (code, X86_ESP, 0); - x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4); + // Use 8 as register size to get Nan/Inf conversion to uint result truncated to 0 + if (size == 8 || (!is_signed && size == 4)) + x86_sse_cvttsd2si_reg_reg (code, dreg, sreg); + else + x86_sse_cvttsd2si_reg_reg_size (code, dreg, sreg, 4); if (size == 1) x86_widen_reg (code, dreg, dreg, is_signed, FALSE); @@ -2229,6 +2274,24 @@ emit_move_return_value (MonoCompile *cfg, MonoInst *ins, guint8 *code) case OP_CALL_MEMBASE: x86_mov_reg_reg (code, ins->dreg, X86_EAX); break; + case OP_FCALL: + case OP_FCALL_REG: + case OP_FCALL_MEMBASE: { + /* The return value is on the fp stack */ + MonoInst *var = get_double_spill_var (cfg); + x86_fst_membase (code, var->inst_basereg, var->inst_offset, TRUE, 
TRUE); + x86_sse_movsd_reg_membase (code, ins->dreg, var->inst_basereg, var->inst_offset); + break; + } + case OP_RCALL: + case OP_RCALL_REG: + case OP_RCALL_MEMBASE: { + /* The return value is on the fp stack */ + MonoInst *var = get_double_spill_var (cfg); + x86_fst_membase (code, var->inst_basereg, var->inst_offset, FALSE, TRUE); + x86_sse_movss_reg_membase (code, ins->dreg, var->inst_basereg, var->inst_offset); + break; + } default: break; } @@ -3144,18 +3207,21 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; } case OP_FCALL: + case OP_FCALL_REG: + case OP_FCALL_MEMBASE: + case OP_RCALL: + case OP_RCALL_REG: + case OP_RCALL_MEMBASE: case OP_LCALL: case OP_VCALL: case OP_VCALL2: case OP_VOIDCALL: case OP_CALL: - case OP_FCALL_REG: case OP_LCALL_REG: case OP_VCALL_REG: case OP_VCALL2_REG: case OP_VOIDCALL_REG: case OP_CALL_REG: - case OP_FCALL_MEMBASE: case OP_LCALL_MEMBASE: case OP_VCALL_MEMBASE: case OP_VCALL2_MEMBASE: @@ -3168,6 +3234,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) switch (ins->opcode) { case OP_FCALL: + case OP_RCALL: case OP_LCALL: case OP_VCALL: case OP_VCALL2: @@ -3178,6 +3245,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; } case OP_FCALL_REG: + case OP_RCALL_REG: case OP_LCALL_REG: case OP_VCALL_REG: case OP_VCALL2_REG: @@ -3186,6 +3254,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) x86_call_reg (code, ins->sreg1); break; case OP_FCALL_MEMBASE: + case OP_RCALL_MEMBASE: case OP_LCALL_MEMBASE: case OP_VCALL_MEMBASE: case OP_VCALL2_MEMBASE: @@ -3393,129 +3462,131 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; /* floating point opcodes */ - case OP_R8CONST: { - double d = *(double *)ins->inst_p0; - - if ((d == 0.0) && (mono_signbit (d) == 0)) { - x86_fldz (code); - } else if (d == 1.0) { - x86_fld1 (code); - } else { - if (cfg->compile_aot) { - guint32 *val = (guint32*)&d; - x86_push_imm (code, val 
[1]); - x86_push_imm (code, val [0]); - x86_fld_membase (code, X86_ESP, 0, TRUE); - x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8); - } - else { - mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_R8, ins->inst_p0); - x86_fld (code, (gsize)NULL, TRUE); - } - } + case OP_R8CONST: + code = emit_r8const (cfg, code, ins->dreg, (double*)ins->inst_p0); break; - } - case OP_R4CONST: { - float f = *(float *)ins->inst_p0; - - if ((f == 0.0) && (mono_signbit (f) == 0)) { - x86_fldz (code); - } else if (f == 1.0) { - x86_fld1 (code); - } else { - if (cfg->compile_aot) { - guint32 val = *(guint32*)&f; - x86_push_imm (code, val); - x86_fld_membase (code, X86_ESP, 0, FALSE); - x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4); - } - else { - mono_add_patch_info (cfg, code - cfg->native_code, MONO_PATCH_INFO_R4, ins->inst_p0); - x86_fld (code, (gsize)NULL, FALSE); - } - } + case OP_R4CONST: + code = emit_r4const (cfg, code, ins->dreg, (float*)ins->inst_p0); break; - } case OP_STORER8_MEMBASE_REG: - x86_fst_membase (code, ins->inst_destbasereg, ins->inst_offset, TRUE, TRUE); + x86_sse_movsd_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1); break; case OP_LOADR8_MEMBASE: - x86_fld_membase (code, ins->inst_basereg, ins->inst_offset, TRUE); + x86_sse_movsd_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset); break; case OP_STORER4_MEMBASE_REG: - x86_fst_membase (code, ins->inst_destbasereg, ins->inst_offset, FALSE, TRUE); + x86_sse_movss_membase_reg (code, ins->inst_destbasereg, ins->inst_offset, ins->sreg1); break; case OP_LOADR4_MEMBASE: - x86_fld_membase (code, ins->inst_basereg, ins->inst_offset, FALSE); + x86_sse_movss_reg_membase (code, ins->dreg, ins->inst_basereg, ins->inst_offset); + break; + case OP_X86_MOVE_R8_TO_FPSTACK: { + /* Move a value from an SSE register to the top of the fp stack */ + MonoInst *var = get_double_spill_var (cfg); + x86_sse_movsd_membase_reg (code, var->inst_basereg, var->inst_offset, ins->sreg1); + 
x86_fld_membase (code, var->inst_basereg, var->inst_offset, TRUE); break; + } + case OP_X86_MOVE_R4_TO_FPSTACK: { + /* Move a value from an SSE register to the top of the fp stack */ + MonoInst *var = get_double_spill_var (cfg); + x86_sse_movss_membase_reg (code, var->inst_basereg, var->inst_offset, ins->sreg1); + x86_fld_membase (code, var->inst_basereg, var->inst_offset, FALSE); + break; + } case OP_ICONV_TO_R4: - x86_push_reg (code, ins->sreg1); - x86_fild_membase (code, X86_ESP, 0, FALSE); - /* Change precision */ - x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE); - x86_fld_membase (code, X86_ESP, 0, FALSE); - x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4); + x86_sse_cvtsi2ss_reg_reg_size (code, ins->dreg, ins->sreg1, 4); break; case OP_ICONV_TO_R8: - x86_push_reg (code, ins->sreg1); - x86_fild_membase (code, X86_ESP, 0, FALSE); - x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4); + x86_sse_cvtsi2sd_reg_reg_size (code, ins->dreg, ins->sreg1, 4); break; case OP_ICONV_TO_R_UN: x86_push_imm (code, 0); x86_push_reg (code, ins->sreg1); x86_fild_membase (code, X86_ESP, 0, TRUE); + x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE); + x86_sse_movsd_reg_membase (code, ins->dreg, X86_ESP, 0); x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8); break; - case OP_X86_FP_LOAD_I8: - x86_fild_membase (code, ins->inst_basereg, ins->inst_offset, TRUE); - break; - case OP_X86_FP_LOAD_I4: - x86_fild_membase (code, ins->inst_basereg, ins->inst_offset, FALSE); - break; case OP_FCONV_TO_R4: - /* Change precision */ - x86_alu_reg_imm (code, X86_SUB, X86_ESP, 4); - x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE); - x86_fld_membase (code, X86_ESP, 0, FALSE); - x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4); + x86_sse_cvtsd2ss_reg_reg (code, ins->dreg, ins->sreg1); break; case OP_FCONV_TO_I1: - code = emit_float_to_int (cfg, code, ins->dreg, 1, TRUE); + code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 1, TRUE); break; case OP_FCONV_TO_U1: - code = emit_float_to_int (cfg, code, ins->dreg, 1, FALSE); 
+ code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 1, FALSE); break; case OP_FCONV_TO_I2: - code = emit_float_to_int (cfg, code, ins->dreg, 2, TRUE); + code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 2, TRUE); break; case OP_FCONV_TO_U2: - code = emit_float_to_int (cfg, code, ins->dreg, 2, FALSE); + code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 2, FALSE); break; case OP_FCONV_TO_I4: - code = emit_float_to_int (cfg, code, ins->dreg, 4, TRUE); + code = emit_float_to_int (cfg, code, ins->dreg, ins->sreg1, 4, TRUE); break; case OP_FCONV_TO_I8: + case OP_RCONV_TO_I8: x86_alu_reg_imm (code, X86_SUB, X86_ESP, 4); x86_fnstcw_membase(code, X86_ESP, 0); x86_mov_reg_membase (code, ins->dreg, X86_ESP, 0, 2); x86_alu_reg_imm (code, X86_OR, ins->dreg, 0xc00); x86_mov_membase_reg (code, X86_ESP, 2, ins->dreg, 2); x86_fldcw_membase (code, X86_ESP, 2); + x86_alu_reg_imm (code, X86_SUB, X86_ESP, 8); + if (ins->opcode == OP_FCONV_TO_I8) { + x86_sse_movsd_membase_reg (code, X86_ESP, 0, ins->sreg1); + } else { + x86_sse_cvtss2sd_reg_reg (code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg1); + x86_sse_movsd_membase_reg (code, X86_ESP, 0, MONO_ARCH_FP_SCRATCH_REG); + } + x86_fld_membase (code, X86_ESP, 0, TRUE); x86_fist_pop_membase (code, X86_ESP, 0, TRUE); x86_pop_reg (code, ins->dreg); x86_pop_reg (code, ins->backend.reg3); x86_fldcw_membase (code, X86_ESP, 0); x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4); break; + case OP_RCONV_TO_I1: + x86_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4); + x86_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE); + break; + case OP_RCONV_TO_U1: + x86_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4); + x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); + break; + case OP_RCONV_TO_I2: + x86_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4); + x86_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE); + break; + case OP_RCONV_TO_U2: + x86_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 
4); + x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE); + break; + case OP_RCONV_TO_I4: + case OP_RCONV_TO_I: + x86_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4); + break; + case OP_RCONV_TO_U4: + // FIXME: + x86_sse_cvtss2si_reg_reg (code, ins->dreg, ins->sreg1); + break; + case OP_RCONV_TO_R8: + x86_sse_cvtss2sd_reg_reg (code, ins->dreg, ins->sreg1); + break; + case OP_RCONV_TO_R4: + if (ins->dreg != ins->sreg1) + x86_sse_movss_reg_reg (code, ins->dreg, ins->sreg1); + break; + case OP_LCONV_TO_R8_2: x86_push_reg (code, ins->sreg2); x86_push_reg (code, ins->sreg1); x86_fild_membase (code, X86_ESP, 0, TRUE); - /* Change precision */ x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE); - x86_fld_membase (code, X86_ESP, 0, TRUE); + x86_sse_movsd_reg_membase (code, ins->dreg, X86_ESP, 0); x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8); break; case OP_LCONV_TO_R4_2: @@ -3524,7 +3595,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) x86_fild_membase (code, X86_ESP, 0, TRUE); /* Change precision */ x86_fst_membase (code, X86_ESP, 0, FALSE, TRUE); - x86_fld_membase (code, X86_ESP, 0, FALSE); + x86_sse_movss_reg_membase (code, ins->dreg, X86_ESP, 0); x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8); break; case OP_LCONV_TO_R_UN_2: { @@ -3554,12 +3625,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) x86_patch (br, code); - /* Change precision */ + /* Move from the fp stack to dreg and change precision */ x86_fst_membase (code, X86_ESP, 0, TRUE, TRUE); - x86_fld_membase (code, X86_ESP, 0, TRUE); - + x86_sse_movsd_reg_membase (code, ins->dreg, X86_ESP, 0); x86_alu_reg_imm (code, X86_ADD, X86_ESP, 8); - break; } case OP_LCONV_TO_OVF_I: @@ -3606,81 +3675,71 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; } case OP_FMOVE: - /* Not needed on the fp stack */ + if (ins->dreg != ins->sreg1) + x86_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1); + break; + case OP_RMOVE: + if (ins->dreg != 
ins->sreg1) + x86_sse_movss_reg_reg (code, ins->dreg, ins->sreg1); break; case OP_MOVE_F_TO_I4: - x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE); - x86_mov_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, 4); + x86_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); break; case OP_MOVE_I4_TO_F: - x86_mov_membase_reg (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1, 4); - x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE); + x86_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4); break; case OP_FADD: - x86_fp_op_reg (code, X86_FADD, 1, TRUE); + x86_sse_addsd_reg_reg (code, ins->dreg, ins->sreg2); break; case OP_FSUB: - x86_fp_op_reg (code, X86_FSUB, 1, TRUE); + x86_sse_subsd_reg_reg (code, ins->dreg, ins->sreg2); + break; + case OP_FNEG: { + static double r8_0 = -0.0; + + g_assert (ins->sreg1 == ins->dreg); + + code = emit_r8const (cfg, code, MONO_ARCH_FP_SCRATCH_REG, &r8_0); + x86_sse_xorpd_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); break; + } + case OP_ABS: { + static guint64 d = 0x7fffffffffffffffUL; + + g_assert (ins->sreg1 == ins->dreg); + + code = emit_r8const (cfg, code, MONO_ARCH_FP_SCRATCH_REG, (double*)&d); + x86_sse_andpd_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + break; + } case OP_FMUL: - x86_fp_op_reg (code, X86_FMUL, 1, TRUE); + x86_sse_mulsd_reg_reg (code, ins->dreg, ins->sreg2); break; case OP_FDIV: - x86_fp_op_reg (code, X86_FDIV, 1, TRUE); + x86_sse_divsd_reg_reg (code, ins->dreg, ins->sreg2); break; - case OP_FNEG: - x86_fchs (code); + case OP_RADD: + x86_sse_addss_reg_reg (code, ins->dreg, ins->sreg2); break; - case OP_ABS: - x86_fabs (code); - break; - case OP_TAN: { - /* - * it really doesn't make sense to inline all this code, - * it's here just to show that things may not be as simple - * as they appear. 
- */ - guchar *check_pos, *end_tan, *pop_jump; - x86_push_reg (code, X86_EAX); - x86_fptan (code); - x86_fnstsw (code); - x86_test_reg_imm (code, X86_EAX, X86_FP_C2); - check_pos = code; - x86_branch8 (code, X86_CC_NE, 0, FALSE); - x86_fstp (code, 0); /* pop the 1.0 */ - end_tan = code; - x86_jump8 (code, 0); - x86_fldpi (code); - x86_fp_op (code, X86_FADD, 0); - x86_fxch (code, 1); - x86_fprem1 (code); - x86_fstsw (code); - x86_test_reg_imm (code, X86_EAX, X86_FP_C2); - pop_jump = code; - x86_branch8 (code, X86_CC_NE, 0, FALSE); - x86_fstp (code, 1); - x86_fptan (code); - x86_patch (pop_jump, code); - x86_fstp (code, 0); /* pop the 1.0 */ - x86_patch (check_pos, code); - x86_patch (end_tan, code); - x86_fldz (code); - x86_fp_op_reg (code, X86_FADD, 1, TRUE); - x86_pop_reg (code, X86_EAX); + case OP_RSUB: + x86_sse_subss_reg_reg (code, ins->dreg, ins->sreg2); break; - } - case OP_ATAN: - x86_fld1 (code); - x86_fpatan (code); - x86_fldz (code); - x86_fp_op_reg (code, X86_FADD, 1, TRUE); + case OP_RMUL: + x86_sse_mulss_reg_reg (code, ins->dreg, ins->sreg2); break; - case OP_SQRT: - x86_fsqrt (code); + case OP_RDIV: + x86_sse_divss_reg_reg (code, ins->dreg, ins->sreg2); break; - case OP_ROUND: - x86_frndint (code); + case OP_RNEG: { + static float r4_0 = -0.0; + + g_assert (ins->sreg1 == ins->dreg); + + code = emit_r4const (cfg, code, MONO_ARCH_FP_SCRATCH_REG, &r4_0); + x86_sse_xorps_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); break; + } + case OP_IMIN: g_assert (cfg->opt & MONO_OPT_CMOV); g_assert (ins->dreg == ins->sreg1); @@ -3711,379 +3770,255 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_X86_FXCH: x86_fxch (code, ins->inst_imm); break; - case OP_FREM: { - guint8 *l1, *l2; - - x86_push_reg (code, X86_EAX); - /* we need to exchange ST(0) with ST(1) */ - x86_fxch (code, 1); - - /* this requires a loop, because fprem sometimes - * returns a partial remainder */ - l1 = code; - /* looks like MS is using fprem instead of the 
IEEE compatible fprem1 */ - /* x86_fprem1 (code); */ - x86_fprem (code); - x86_fnstsw (code); - x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_C2); - l2 = code; - x86_branch8 (code, X86_CC_NE, 0, FALSE); - x86_patch (l2, l1); - - /* pop result */ - x86_fstp (code, 1); - - x86_pop_reg (code, X86_EAX); - break; - } case OP_FCOMPARE: - if (cfg->opt & MONO_OPT_FCMOV) { - x86_fcomip (code, 1); - x86_fstp (code, 0); - break; - } - /* this overwrites EAX */ - EMIT_FPCOMPARE(code); - x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK); + /* + * The two arguments are swapped because the fbranch instructions + * depend on this for the non-sse case to work. + * FIXME: Get rid of this. + */ + x86_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1); + break; + case OP_RCOMPARE: + /* + * FIXME: Get rid of this. + * The two arguments are swapped because the fbranch instructions + * depend on this for the non-sse case to work. + */ + x86_sse_comiss_reg_reg (code, ins->sreg2, ins->sreg1); break; - case OP_FCEQ: case OP_FCNEQ: - if (cfg->opt & MONO_OPT_FCMOV) { - /* zeroing the register at the start results in - * shorter and faster code (we can also remove the widening op) - */ - guchar *unordered_check; - x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); - x86_fcomip (code, 1); - x86_fstp (code, 0); - unordered_check = code; - x86_branch8 (code, X86_CC_P, 0, FALSE); - if (ins->opcode == OP_FCEQ) { - x86_set_reg (code, X86_CC_EQ, ins->dreg, FALSE); - x86_patch (unordered_check, code); - } else { - guchar *jump_to_end; - x86_set_reg (code, X86_CC_NE, ins->dreg, FALSE); - jump_to_end = code; - x86_jump8 (code, 0); - x86_patch (unordered_check, code); - x86_inc_reg (code, ins->dreg); - x86_patch (jump_to_end, code); - } - - break; - } - if (ins->dreg != X86_EAX) - x86_push_reg (code, X86_EAX); + case OP_FCEQ: + case OP_RCNEQ: { + /* zeroing the register at the start results in + * shorter and faster code (we can also remove the widening op) + */ + guchar *unordered_check; - 
EMIT_FPCOMPARE(code); - x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0x4000); - x86_set_reg (code, ins->opcode == OP_FCEQ ? X86_CC_EQ : X86_CC_NE, ins->dreg, TRUE); - x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); + x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); + if (ins->opcode == OP_RCNEQ) + x86_sse_comiss_reg_reg (code, ins->sreg1, ins->sreg2); + else + x86_sse_comisd_reg_reg (code, ins->sreg1, ins->sreg2); + unordered_check = code; + x86_branch8 (code, X86_CC_P, 0, FALSE); - if (ins->dreg != X86_EAX) - x86_pop_reg (code, X86_EAX); + if (ins->opcode == OP_FCEQ) { + x86_set_reg (code, X86_CC_EQ, ins->dreg, FALSE); + x86_patch (unordered_check, code); + } else { + guchar *jump_to_end; + x86_set_reg (code, X86_CC_NE, ins->dreg, FALSE); + jump_to_end = code; + x86_jump8 (code, 0); + x86_patch (unordered_check, code); + x86_inc_reg (code, ins->dreg); + x86_patch (jump_to_end, code); + } break; + } case OP_FCLT: - case OP_FCLT_UN: - if (cfg->opt & MONO_OPT_FCMOV) { - /* zeroing the register at the start results in - * shorter and faster code (we can also remove the widening op) - */ - x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); - x86_fcomip (code, 1); - x86_fstp (code, 0); - if (ins->opcode == OP_FCLT_UN) { - guchar *unordered_check = code; - guchar *jump_to_end; - x86_branch8 (code, X86_CC_P, 0, FALSE); - x86_set_reg (code, X86_CC_GT, ins->dreg, FALSE); - jump_to_end = code; - x86_jump8 (code, 0); - x86_patch (unordered_check, code); - x86_inc_reg (code, ins->dreg); - x86_patch (jump_to_end, code); - } else { - x86_set_reg (code, X86_CC_GT, ins->dreg, FALSE); - } - break; - } - if (ins->dreg != X86_EAX) - x86_push_reg (code, X86_EAX); - - EMIT_FPCOMPARE(code); - x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK); + case OP_FCLT_UN: { + /* zeroing the register at the start results in + * shorter and faster code (we can also remove the widening op) + */ + x86_alu_reg_reg (code, 
X86_XOR, ins->dreg, ins->dreg); + x86_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1); if (ins->opcode == OP_FCLT_UN) { - guchar *is_not_zero_check, *end_jump; - is_not_zero_check = code; - x86_branch8 (code, X86_CC_NZ, 0, TRUE); - end_jump = code; + guchar *unordered_check = code; + guchar *jump_to_end; + x86_branch8 (code, X86_CC_P, 0, FALSE); + x86_set_reg (code, X86_CC_GT, ins->dreg, FALSE); + jump_to_end = code; x86_jump8 (code, 0); - x86_patch (is_not_zero_check, code); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_CC_MASK); - - x86_patch (end_jump, code); + x86_patch (unordered_check, code); + x86_inc_reg (code, ins->dreg); + x86_patch (jump_to_end, code); + } else { + x86_set_reg (code, X86_CC_GT, ins->dreg, FALSE); } - x86_set_reg (code, X86_CC_EQ, ins->dreg, TRUE); - x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); - - if (ins->dreg != X86_EAX) - x86_pop_reg (code, X86_EAX); break; + } case OP_FCLE: { guchar *unordered_check; - guchar *jump_to_end; - if (cfg->opt & MONO_OPT_FCMOV) { - /* zeroing the register at the start results in - * shorter and faster code (we can also remove the widening op) - */ - x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); - x86_fcomip (code, 1); - x86_fstp (code, 0); - unordered_check = code; - x86_branch8 (code, X86_CC_P, 0, FALSE); - x86_set_reg (code, X86_CC_NB, ins->dreg, FALSE); - x86_patch (unordered_check, code); - break; - } - if (ins->dreg != X86_EAX) - x86_push_reg (code, X86_EAX); - - EMIT_FPCOMPARE(code); - x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0x4500); + x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); + x86_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1); unordered_check = code; - x86_branch8 (code, X86_CC_EQ, 0, FALSE); - - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0); - x86_set_reg (code, X86_CC_NE, ins->dreg, TRUE); - x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); - jump_to_end = code; - x86_jump8 
(code, 0); + x86_branch8 (code, X86_CC_P, 0, FALSE); + x86_set_reg (code, X86_CC_NB, ins->dreg, FALSE); x86_patch (unordered_check, code); - x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); - x86_patch (jump_to_end, code); - - if (ins->dreg != X86_EAX) - x86_pop_reg (code, X86_EAX); break; } case OP_FCGT: - case OP_FCGT_UN: - if (cfg->opt & MONO_OPT_FCMOV) { - /* zeroing the register at the start results in - * shorter and faster code (we can also remove the widening op) - */ - guchar *unordered_check; - x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); - x86_fcomip (code, 1); - x86_fstp (code, 0); - if (ins->opcode == OP_FCGT) { - unordered_check = code; - x86_branch8 (code, X86_CC_P, 0, FALSE); - x86_set_reg (code, X86_CC_LT, ins->dreg, FALSE); - x86_patch (unordered_check, code); - } else { - x86_set_reg (code, X86_CC_LT, ins->dreg, FALSE); - } - break; - } - if (ins->dreg != X86_EAX) - x86_push_reg (code, X86_EAX); - - EMIT_FPCOMPARE(code); - x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0); - if (ins->opcode == OP_FCGT_UN) { - guchar *is_not_zero_check, *end_jump; - is_not_zero_check = code; - x86_branch8 (code, X86_CC_NZ, 0, TRUE); - end_jump = code; - x86_jump8 (code, 0); - x86_patch (is_not_zero_check, code); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_CC_MASK); - - x86_patch (end_jump, code); - } - x86_set_reg (code, X86_CC_EQ, ins->dreg, TRUE); - x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); - - if (ins->dreg != X86_EAX) - x86_pop_reg (code, X86_EAX); - break; - case OP_FCGE: { + case OP_FCGT_UN: { + /* zeroing the register at the start results in + * shorter and faster code (we can also remove the widening op) + */ guchar *unordered_check; - guchar *jump_to_end; - if (cfg->opt & MONO_OPT_FCMOV) { - /* zeroing the register at the start results in - * shorter and faster code (we can also remove the widening op) - */ - x86_alu_reg_reg (code, X86_XOR, ins->dreg, 
ins->dreg); - x86_fcomip (code, 1); - x86_fstp (code, 0); + + x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); + x86_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1); + if (ins->opcode == OP_FCGT) { unordered_check = code; x86_branch8 (code, X86_CC_P, 0, FALSE); - x86_set_reg (code, X86_CC_NA, ins->dreg, FALSE); + x86_set_reg (code, X86_CC_LT, ins->dreg, FALSE); x86_patch (unordered_check, code); - break; + } else { + x86_set_reg (code, X86_CC_LT, ins->dreg, FALSE); } - if (ins->dreg != X86_EAX) - x86_push_reg (code, X86_EAX); - - EMIT_FPCOMPARE(code); - x86_alu_reg_imm (code, X86_AND, X86_EAX, X86_FP_CC_MASK); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0x4500); + break; + } + case OP_FCGE: { + guchar *unordered_check; + x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); + x86_sse_comisd_reg_reg (code, ins->sreg2, ins->sreg1); unordered_check = code; - x86_branch8 (code, X86_CC_EQ, 0, FALSE); - - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0); - x86_set_reg (code, X86_CC_GE, ins->dreg, TRUE); - x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); - jump_to_end = code; - x86_jump8 (code, 0); + x86_branch8 (code, X86_CC_P, 0, FALSE); + x86_set_reg (code, X86_CC_NA, ins->dreg, FALSE); x86_patch (unordered_check, code); - x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); - x86_patch (jump_to_end, code); - - if (ins->dreg != X86_EAX) - x86_pop_reg (code, X86_EAX); break; } - case OP_FBEQ: - if (cfg->opt & MONO_OPT_FCMOV) { - guchar *jump = code; - x86_branch8 (code, X86_CC_P, 0, TRUE); - EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE); - x86_patch (jump, code); - break; - } - x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0x4000); - EMIT_COND_BRANCH (ins, X86_CC_EQ, TRUE); + case OP_FBEQ: { + guchar *jump = code; + x86_branch8 (code, X86_CC_P, 0, TRUE); + EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE); + x86_patch (jump, code); break; + } case OP_FBNE_UN: /* Branch if C013 != 100 */ - if (cfg->opt & MONO_OPT_FCMOV) { - /* branch if !ZF or (PF|CF) */ - 
EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE); - EMIT_COND_BRANCH (ins, X86_CC_P, FALSE); - EMIT_COND_BRANCH (ins, X86_CC_B, FALSE); - break; - } - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C3); + /* branch if !ZF or (PF|CF) */ EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE); + EMIT_COND_BRANCH (ins, X86_CC_P, FALSE); + EMIT_COND_BRANCH (ins, X86_CC_B, FALSE); break; case OP_FBLT: - if (cfg->opt & MONO_OPT_FCMOV) { - EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE); - break; - } - EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE); + EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE); break; case OP_FBLT_UN: - if (cfg->opt & MONO_OPT_FCMOV) { - EMIT_COND_BRANCH (ins, X86_CC_P, FALSE); - EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE); - break; - } - if (ins->opcode == OP_FBLT_UN) { - guchar *is_not_zero_check, *end_jump; - is_not_zero_check = code; - x86_branch8 (code, X86_CC_NZ, 0, TRUE); - end_jump = code; - x86_jump8 (code, 0); - x86_patch (is_not_zero_check, code); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_CC_MASK); - - x86_patch (end_jump, code); - } - EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE); + EMIT_COND_BRANCH (ins, X86_CC_P, FALSE); + EMIT_COND_BRANCH (ins, X86_CC_GT, FALSE); break; case OP_FBGT: case OP_FBGT_UN: - if (cfg->opt & MONO_OPT_FCMOV) { - if (ins->opcode == OP_FBGT) { - guchar *br1; - - /* skip branch if C1=1 */ - br1 = code; - x86_branch8 (code, X86_CC_P, 0, FALSE); - /* branch if (C0 | C3) = 1 */ - EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE); - x86_patch (br1, code); - } else { - EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE); - } - break; - } - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0); - if (ins->opcode == OP_FBGT_UN) { - guchar *is_not_zero_check, *end_jump; - is_not_zero_check = code; - x86_branch8 (code, X86_CC_NZ, 0, TRUE); - end_jump = code; - x86_jump8 (code, 0); - x86_patch (is_not_zero_check, code); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_CC_MASK); - - x86_patch (end_jump, code); - } - EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE); - break; - case OP_FBGE: - /* 
Branch if C013 == 100 or 001 */ - if (cfg->opt & MONO_OPT_FCMOV) { + if (ins->opcode == OP_FBGT) { guchar *br1; /* skip branch if C1=1 */ br1 = code; x86_branch8 (code, X86_CC_P, 0, FALSE); /* branch if (C0 | C3) = 1 */ - EMIT_COND_BRANCH (ins, X86_CC_BE, FALSE); + EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE); x86_patch (br1, code); break; + } else { + EMIT_COND_BRANCH (ins, X86_CC_LT, FALSE); } - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0); - EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C3); - EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE); break; + case OP_FBGE: { + /* Branch if C013 == 100 or 001 */ + guchar *br1; + + /* skip branch if C1=1 */ + br1 = code; + x86_branch8 (code, X86_CC_P, 0, FALSE); + /* branch if (C0 | C3) = 1 */ + EMIT_COND_BRANCH (ins, X86_CC_BE, FALSE); + x86_patch (br1, code); + break; + } case OP_FBGE_UN: /* Branch if C013 == 000 */ - if (cfg->opt & MONO_OPT_FCMOV) { - EMIT_COND_BRANCH (ins, X86_CC_LE, FALSE); - break; - } - EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE); + EMIT_COND_BRANCH (ins, X86_CC_LE, FALSE); break; - case OP_FBLE: + case OP_FBLE: { /* Branch if C013=000 or 100 */ - if (cfg->opt & MONO_OPT_FCMOV) { - guchar *br1; + guchar *br1; - /* skip branch if C1=1 */ - br1 = code; - x86_branch8 (code, X86_CC_P, 0, FALSE); - /* branch if C0=0 */ - EMIT_COND_BRANCH (ins, X86_CC_NB, FALSE); - x86_patch (br1, code); - break; - } - x86_alu_reg_imm (code, X86_AND, X86_EAX, (X86_FP_C0|X86_FP_C1)); - x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0); - EMIT_COND_BRANCH (ins, X86_CC_EQ, FALSE); + /* skip branch if C1=1 */ + br1 = code; + x86_branch8 (code, X86_CC_P, 0, FALSE); + /* branch if C0=0 */ + EMIT_COND_BRANCH (ins, X86_CC_NB, FALSE); + x86_patch (br1, code); break; + } case OP_FBLE_UN: /* Branch if C013 != 001 */ - if (cfg->opt & MONO_OPT_FCMOV) { - EMIT_COND_BRANCH (ins, X86_CC_P, FALSE); - EMIT_COND_BRANCH (ins, X86_CC_GE, FALSE); + EMIT_COND_BRANCH (ins, X86_CC_P, FALSE); + EMIT_COND_BRANCH 
(ins, X86_CC_GE, FALSE); + break; + + case OP_RCEQ: + case OP_RCGT: + case OP_RCLT: + case OP_RCLT_UN: + case OP_RCGT_UN: { + int x86_cond; + + x86_alu_reg_reg (code, X86_XOR, ins->dreg, ins->dreg); + x86_sse_comiss_reg_reg (code, ins->sreg2, ins->sreg1); + + switch (ins->opcode) { + case OP_RCEQ: + x86_cond = X86_CC_EQ; + break; + case OP_RCGT: + x86_cond = X86_CC_LT; + break; + case OP_RCLT: + x86_cond = X86_CC_GT; + break; + case OP_RCLT_UN: + x86_cond = X86_CC_GT; + break; + case OP_RCGT_UN: + x86_cond = X86_CC_LT; + break; + default: + g_assert_not_reached (); + break; + } + + guchar *unordered_check; + + switch (ins->opcode) { + case OP_RCEQ: + case OP_RCGT: + unordered_check = code; + x86_branch8 (code, X86_CC_P, 0, FALSE); + x86_set_reg (code, x86_cond, ins->dreg, FALSE); + x86_patch (unordered_check, code); + break; + case OP_RCLT_UN: + case OP_RCGT_UN: { + guchar *jump_to_end; + + unordered_check = code; + x86_branch8 (code, X86_CC_P, 0, FALSE); + x86_set_reg (code, x86_cond, ins->dreg, FALSE); + jump_to_end = code; + x86_jump8 (code, 0); + x86_patch (unordered_check, code); + x86_inc_reg (code, ins->dreg); + x86_patch (jump_to_end, code); + break; + } + case OP_RCLT: + x86_set_reg (code, x86_cond, ins->dreg, FALSE); + break; + default: + g_assert_not_reached (); break; } - x86_alu_reg_imm (code, X86_CMP, X86_EAX, X86_FP_C0); - EMIT_COND_BRANCH (ins, X86_CC_NE, FALSE); break; + } + case OP_CKFINITE: { + MonoInst *var = get_double_spill_var (cfg); + /* Transfer value to the fp stack */ + x86_sse_movsd_membase_reg (code, var->inst_basereg, var->inst_offset, ins->sreg1); + x86_fld_membase (code, var->inst_basereg, var->inst_offset, TRUE); + guchar *br1; x86_push_reg (code, X86_EAX); x86_fxam (code); @@ -4102,6 +4037,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) x86_patch (br1, code); break; } + case OP_TLS_GET: { code = mono_x86_emit_tls_get (code, ins->dreg, ins->inst_offset); break; @@ -4327,6 +4263,7 @@ 
mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) x86_patch (br, code); break; } + #ifdef MONO_ARCH_SIMD_INTRINSICS case OP_ADDPS: x86_sse_alu_ps_reg_reg (code, X86_SSE_ADD, ins->sreg1, ins->sreg2); @@ -4389,23 +4326,23 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_PSHUFLEW_HIGH: g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF); - x86_pshufw_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0, 1); + x86_sse_pshufw_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0, 1); break; case OP_PSHUFLEW_LOW: g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF); - x86_pshufw_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0, 0); + x86_sse_pshufw_reg_reg (code, ins->dreg, ins->sreg1, ins->inst_c0, 0); break; case OP_PSHUFLED: g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF); - x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->sreg1, ins->inst_c0); + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); break; case OP_SHUFPS: g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0xFF); - x86_sse_alu_reg_reg_imm8 (code, X86_SSE_SHUFP, ins->sreg1, ins->sreg2, ins->inst_c0); + x86_sse_shufps_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); break; case OP_SHUFPD: g_assert (ins->inst_c0 >= 0 && ins->inst_c0 <= 0x3); - x86_sse_alu_pd_reg_reg_imm8 (code, X86_SSE_SHUFP, ins->sreg1, ins->sreg2, ins->inst_c0); + x86_sse_shufpd_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); break; case OP_ADDPD: @@ -4727,6 +4664,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_ICONV_TO_X: x86_movd_xreg_reg (code, ins->dreg, ins->sreg1); break; + case OP_ICONV_TO_R4_RAW: + x86_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4); + break; case OP_EXTRACT_I4: x86_movd_reg_xreg (code, ins->dreg, ins->sreg1); break; @@ -4744,17 +4684,15 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; case OP_EXTRACT_R8: if (ins->inst_c0) - x86_sse_alu_pd_membase_reg 
(code, X86_SSE_MOVHPD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1); + x86_movhlps_reg_reg (code, ins->dreg, ins->sreg1); else - x86_sse_alu_sd_membase_reg (code, X86_SSE_MOVSD_MEMBASE_REG, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, ins->sreg1); - x86_fld_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE); + x86_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1); break; - case OP_INSERT_I2: - x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->sreg1, ins->sreg2, ins->inst_c0); + x86_sse_pinsrw_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); break; case OP_EXTRACTX_U2: - x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PEXTRW, ins->dreg, ins->sreg1, ins->inst_c0); + x86_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); break; case OP_INSERTX_U1_SLOW: /*sreg1 is the extracted ireg (scratch) @@ -4768,30 +4706,42 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) x86_shift_reg_imm (code, X86_SHL, ins->sreg2, 8); /*join them together*/ x86_alu_reg_reg (code, X86_OR, ins->sreg1, ins->sreg2); - x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, ins->inst_c0 / 2); + x86_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0 / 2); break; case OP_INSERTX_I4_SLOW: - x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2); + x86_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2); x86_shift_reg_imm (code, X86_SHR, ins->sreg2, 16); - x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1); + x86_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1); break; case OP_INSERTX_R4_SLOW: - x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE); - /*TODO if inst_c0 == 0 use movss*/ - x86_sse_alu_pd_reg_membase_imm 
(code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 0, ins->inst_c0 * 2); - x86_sse_alu_pd_reg_membase_imm (code, X86_SSE_PINSRW, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset + 2, ins->inst_c0 * 2 + 1); + switch (ins->inst_c0) { + case 0: + x86_sse_movss_reg_reg (code, ins->dreg, ins->sreg2); + break; + case 1: + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3)); + x86_sse_movss_reg_reg (code, ins->dreg, ins->sreg2); + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(1, 0, 2, 3)); + break; + case 2: + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3)); + x86_sse_movss_reg_reg (code, ins->dreg, ins->sreg2); + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(2, 1, 0, 3)); + break; + case 3: + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0)); + x86_sse_movss_reg_reg (code, ins->dreg, ins->sreg2); + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, mono_simd_shuffle_mask(3, 1, 2, 0)); + break; + } break; case OP_INSERTX_R8_SLOW: - x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE); - if (cfg->verbose_level) - printf ("CONVERTING a OP_INSERTX_R8_SLOW %d offset %x\n", ins->inst_c0, offset); if (ins->inst_c0) - x86_sse_alu_pd_reg_membase (code, X86_SSE_MOVHPD_REG_MEMBASE, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset); + x86_movlhps_reg_reg (code, ins->dreg, ins->sreg2); else - x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset); + x86_sse_movsd_reg_reg (code, ins->dreg, ins->sreg2); break; - case OP_STOREX_MEMBASE_REG: case OP_STOREX_MEMBASE: x86_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1); @@ -4823,30 
+4773,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_XONES: x86_sse_alu_pd_reg_reg (code, X86_SSE_PCMPEQB, ins->dreg, ins->dreg); break; - case OP_FCONV_TO_R8_X: - x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE); - x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset); + x86_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1); break; - - case OP_XCONV_R8_TO_I4: - x86_cvttsd2si (code, ins->dreg, ins->sreg1); - switch (ins->backend.source_opcode) { - case OP_FCONV_TO_I1: - x86_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE); - break; - case OP_FCONV_TO_U1: - x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); - break; - case OP_FCONV_TO_I2: - x86_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE); - break; - case OP_FCONV_TO_U2: - x86_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE); - break; - } - break; - case OP_EXPAND_I2: x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 0); x86_sse_alu_pd_reg_reg_imm (code, X86_SSE_PINSRW, ins->dreg, ins->sreg1, 1); @@ -4857,14 +4786,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0); break; case OP_EXPAND_R4: - x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, FALSE, TRUE); - x86_movd_xreg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset); - x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0); + x86_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1); + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0); break; case OP_EXPAND_R8: - x86_fst_membase (code, ins->backend.spill_var->inst_basereg, ins->backend.spill_var->inst_offset, TRUE, TRUE); - x86_movsd_reg_membase (code, ins->dreg, ins->backend.spill_var->inst_basereg, 
ins->backend.spill_var->inst_offset); - x86_sse_shift_reg_imm (code, X86_SSE_PSHUFD, ins->dreg, ins->dreg, 0x44); + x86_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1); + x86_sse_pshufd_reg_reg_imm (code, ins->dreg, ins->dreg, 0x44); break; case OP_CVTDQ2PD: @@ -4988,8 +4915,8 @@ mono_arch_patch_code_new (MonoCompile *cfg, guint8 *code, MonoJumpInfo *ji, gpoi break; case MONO_PATCH_INFO_R4: case MONO_PATCH_INFO_R8: { - guint32 offset = mono_arch_get_patch_offset (ip); - *((gconstpointer *)(ip + offset)) = target; + /* ip points at the memory address inside the instruction */ + *(gconstpointer *)ip = target; break; } default: { @@ -5653,26 +5580,6 @@ mono_arch_emit_inst_for_method (MonoCompile *cfg, MonoMethod *cmethod, MonoMetho int opcode = 0; if (cmethod->klass == mono_class_try_get_math_class ()) { - if (strcmp (cmethod->name, "Tan") == 0) { - opcode = OP_TAN; - } else if (strcmp (cmethod->name, "Atan") == 0) { - opcode = OP_ATAN; - } else if (strcmp (cmethod->name, "Sqrt") == 0) { - opcode = OP_SQRT; - } else if (strcmp (cmethod->name, "Abs") == 0 && fsig->params [0]->type == MONO_TYPE_R8) { - opcode = OP_ABS; - } else if (strcmp (cmethod->name, "Round") == 0 && fsig->param_count == 1 && fsig->params [0]->type == MONO_TYPE_R8) { - opcode = OP_ROUND; - } - - if (opcode && fsig->param_count == 1) { - MONO_INST_NEW (cfg, ins, opcode); - ins->type = STACK_R8; - ins->dreg = mono_alloc_freg (cfg); - ins->sreg1 = args [0]->dreg; - MONO_ADD_INS (cfg->cbb, ins); - } - if (cfg->opt & MONO_OPT_CMOV) { opcode = 0; @@ -5784,9 +5691,6 @@ mono_arch_get_this_arg_from_call (host_mgreg_t *regs, guint8 *code) { host_mgreg_t esp = regs [X86_ESP]; gpointer res; - int offset; - - offset = 0; /* * The stack looks like: @@ -6101,61 +6005,19 @@ mono_arch_context_set_int_reg (MonoContext *ctx, int reg, host_mgreg_t val) #ifdef MONO_ARCH_SIMD_INTRINSICS -static MonoInst* -get_float_to_x_spill_area (MonoCompile *cfg) -{ - if (!cfg->fconv_to_r8_x_var) { - cfg->fconv_to_r8_x_var = 
mono_compile_create_var (cfg, m_class_get_byval_arg (mono_defaults.double_class), OP_LOCAL); - cfg->fconv_to_r8_x_var->flags |= MONO_INST_VOLATILE; /*FIXME, use the don't regalloc flag*/ - } - return cfg->fconv_to_r8_x_var; -} - -/* - * Convert all fconv opts that MONO_OPT_SSE2 would get wrong. - */ void mono_arch_decompose_opts (MonoCompile *cfg, MonoInst *ins) { - MonoInst *fconv; - int dreg, src_opcode; - - if (!(cfg->opt & MONO_OPT_SSE2) || !(cfg->opt & MONO_OPT_SIMD) || COMPILE_LLVM (cfg)) - return; - - switch (src_opcode = ins->opcode) { - case OP_FCONV_TO_I1: - case OP_FCONV_TO_U1: - case OP_FCONV_TO_I2: - case OP_FCONV_TO_U2: - case OP_FCONV_TO_I4: + switch (ins->opcode) { + case OP_CKFINITE: + alloc_double_spill_var (cfg); break; default: - return; + break; } - - /* dreg is the IREG and sreg1 is the FREG */ - MONO_INST_NEW (cfg, fconv, OP_FCONV_TO_R8_X); - fconv->klass = NULL; /*FIXME, what can I use here as the Mono.Simd lib might not be loaded yet*/ - fconv->sreg1 = ins->sreg1; - fconv->dreg = mono_alloc_ireg (cfg); - fconv->type = STACK_VTYPE; - fconv->backend.spill_var = get_float_to_x_spill_area (cfg); - - mono_bblock_insert_before_ins (cfg->cbb, ins, fconv); - - dreg = ins->dreg; - NULLIFY_INS (ins); - ins->opcode = OP_XCONV_R8_TO_I4; - - ins->klass = mono_defaults.int32_class; - ins->sreg1 = fconv->dreg; - ins->dreg = dreg; - ins->type = STACK_I4; - ins->backend.source_opcode = src_opcode; } -#endif /* #ifdef MONO_ARCH_SIMD_INTRINSICS */ +#endif void mono_arch_decompose_long_opts (MonoCompile *cfg, MonoInst *long_ins) diff --git a/src/mono/mono/mini/mini-x86.h b/src/mono/mono/mini/mini-x86.h index a03a77a5cfc50..ed839af690732 100644 --- a/src/mono/mono/mini/mini-x86.h +++ b/src/mono/mono/mini/mini-x86.h @@ -79,16 +79,22 @@ LONG CALLBACK seh_handler(EXCEPTION_POINTERS* ep); #define MONO_ARCH_CALLEE_REGS X86_CALLEE_REGS #define MONO_ARCH_CALLEE_SAVED_REGS X86_CALLER_REGS -#define MONO_ARCH_CALLEE_FREGS (0xff & ~(regmask (MONO_ARCH_FPSTACK_SIZE))) 
+#ifdef TARGET_WIN32 +/* xmm7 is used as a scratch register */ +#define MONO_ARCH_CALLEE_FREGS 0x7f #define MONO_ARCH_CALLEE_SAVED_FREGS 0 +#define MONO_ARCH_FP_SCRATCH_REG X86_XMM7 +#else +/* xmm7 is used as a scratch register */ +#define MONO_ARCH_CALLEE_FREGS 0x7f +#define MONO_ARCH_CALLEE_SAVED_FREGS 0 +#define MONO_ARCH_FP_SCRATCH_REG X86_XMM7 +#endif /* All registers are clobered by a call */ -#define MONO_ARCH_CALLEE_XREGS (0xff & ~(regmask (MONO_MAX_XREGS))) +#define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS #define MONO_ARCH_CALLEE_SAVED_XREGS 0 -#define MONO_ARCH_USE_FPSTACK TRUE -#define MONO_ARCH_FPSTACK_SIZE 6 - #define MONO_ARCH_INST_FIXED_REG(desc) (((desc == ' ') || (desc == 'i')) ? -1 : ((desc == 's') ? X86_ECX : ((desc == 'a') ? X86_EAX : ((desc == 'd') ? X86_EDX : ((desc == 'l') ? X86_EAX : -1))))) #define MONO_ARCH_INST_FIXED_MASK(desc) ((desc == 'y') ? (X86_BYTE_REGS) : 0) @@ -172,6 +178,7 @@ typedef struct { #define MONO_ARCH_EMULATE_FCONV_TO_U8 1 #define MONO_ARCH_EMULATE_FCONV_TO_U4 1 +#define MONO_ARCH_EMULATE_FREM 1 #define MONO_ARCH_NEED_DIV_CHECK 1 #define MONO_ARCH_HAVE_IS_INT_OVERFLOW 1 @@ -222,6 +229,9 @@ typedef struct { #define MONO_ARCH_HAVE_OP_TAILCALL_REG 1 #define MONO_ARCH_HAVE_SDB_TRAMPOLINES 1 #define MONO_ARCH_LLVM_TARGET_LAYOUT "e-p:32:32-n32-S128" +#define MONO_ARCH_FLOAT32_SUPPORTED 1 +#define MONO_ARCH_NEED_SIMD_BANK 1 +#define MONO_ARCH_USE_SHARED_FP_SIMD_BANK 1 /* Used for optimization, not complete */ #define MONO_ARCH_IS_OP_MEMBASE(opcode) ((opcode) == OP_X86_PUSH_MEMBASE) diff --git a/src/mono/mono/mini/mini.c b/src/mono/mono/mini/mini.c index 61c9c76512db0..168d3d3a3c093 100644 --- a/src/mono/mono/mini/mini.c +++ b/src/mono/mono/mini/mini.c @@ -2983,8 +2983,6 @@ init_backend (MonoBackend *backend) #ifdef MONO_ARCH_GSHARED_SUPPORTED backend->gshared_supported = 1; #endif - if (MONO_ARCH_USE_FPSTACK) - backend->use_fpstack = 1; // Does the ABI have a volatile non-parameter register, so tailcall // can pass 
context to generics or interfaces? backend->have_volatile_non_param_register = MONO_ARCH_HAVE_VOLATILE_NON_PARAM_REGISTER; diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index e749854ab1ae3..3d02bd18a0230 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -1228,7 +1228,6 @@ typedef struct { gboolean have_op_tailcall_reg : 1; gboolean have_volatile_non_param_register : 1; guint gshared_supported : 1; - guint use_fpstack : 1; guint ilp32 : 1; guint need_got_var : 1; guint need_div_check : 1; diff --git a/src/mono/mono/mini/ssa.c b/src/mono/mono/mini/ssa.c index 4481f65d3c9b2..a7977f7359371 100644 --- a/src/mono/mono/mini/ssa.c +++ b/src/mono/mono/mini/ssa.c @@ -1414,8 +1414,7 @@ mono_ssa_deadce (MonoCompile *cfg) if (info->def && (!info->uses || ((info->uses->next == NULL) && (((MonoVarUsageInfo*)info->uses->data)->inst == info->def)))) { MonoInst *def = info->def; - /* Eliminating FMOVE could screw up the fp stack */ - if (MONO_IS_MOVE (def) && (!MONO_ARCH_USE_FPSTACK || (def->opcode != OP_FMOVE))) { + if (MONO_IS_MOVE (def)) { MonoInst *src_var = get_vreg_to_inst (cfg, def->sreg1); if (src_var && !(src_var->flags & (MONO_INST_VOLATILE|MONO_INST_INDIRECT))) add_to_dce_worklist (cfg, info, MONO_VARINFO (cfg, src_var->inst_c0), &work_list);