-
Notifications
You must be signed in to change notification settings - Fork 15k
[X86] Check MinMax has NaN and replace with NewX for minimumnum/maximumnum #164546
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
Thanks @wenju-he for the report! |
|
@llvm/pr-subscribers-backend-x86 Author: Phoebe Wang (phoebewang) Changes: It is incorrect to just check for NewX and return its ordered elements. Patch is 104.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164546.diff 2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9580adebba712..6f25ce752325f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29502,16 +29502,23 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
std::swap(NewX, NewY);
+ if (!IgnoreNaN && IsNum && !DAG.isKnownNeverNaN(NewY)) {
+ SDValue IsOrdered = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETO);
+ SDValue X = DAG.getSelect(DL, VT, IsOrdered, NewY, NewX);
+ SDValue Y = DAG.getSelect(DL, VT, IsOrdered, NewX, NewY);
+ NewX = X;
+ NewY = Y;
+ }
+
SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
- if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
+ if (IgnoreNaN || IsNum || DAG.isKnownNeverNaN(NewX))
return MinMax;
if (DAG.isKnownNeverNaN(NewX))
NewX = NewY;
- SDValue IsNaN =
- DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
+ SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
}
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 0fe107cbbeee3..72eb565842ad1 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -22,25 +22,28 @@ declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
define float @test_fmaximumnum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: js .LBB0_2
-; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: js .LBB0_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: jmp .LBB0_3
+; SSE2-NEXT: .LBB0_1:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: .LBB0_3:
+; SSE2-NEXT: ucomiss %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: .LBB0_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: cmpordss %xmm3, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB0_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: .LBB0_4:
-; SSE2-NEXT: maxss %xmm1, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm0
-; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: jnp .LBB0_5
+; SSE2-NEXT: # %bb.4:
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: .LBB0_5:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: cmpordss %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: maxss %xmm3, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum:
@@ -55,9 +58,15 @@ define float @test_fmaximumnum(float %x, float %y) nounwind {
; AVX1-NEXT: vmovdqa %xmm1, %xmm2
; AVX1-NEXT: vmovdqa %xmm0, %xmm1
; AVX1-NEXT: .LBB0_3:
-; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vucomiss %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, %xmm0
+; AVX1-NEXT: jnp .LBB0_5
+; AVX1-NEXT: # %bb.4:
+; AVX1-NEXT: vmovdqa %xmm2, %xmm0
+; AVX1-NEXT: .LBB0_5:
+; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm3
+; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum:
@@ -66,12 +75,16 @@ define float @test_fmaximumnum(float %x, float %y) nounwind {
; AVX512-NEXT: testl %eax, %eax
; AVX512-NEXT: sets %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovaps %xmm1, %xmm2
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vucomiss %xmm2, %xmm2
+; AVX512-NEXT: setnp %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmaxss %xmm1, %xmm2, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum:
@@ -94,9 +107,15 @@ define float @test_fmaximumnum(float %x, float %y) nounwind {
; X86-NEXT: vmovdqa %xmm0, %xmm1
; X86-NEXT: vmovdqa %xmm2, %xmm0
; X86-NEXT: .LBB0_3:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vucomiss %xmm0, %xmm0
+; X86-NEXT: vmovdqa %xmm0, %xmm2
+; X86-NEXT: jnp .LBB0_5
+; X86-NEXT: # %bb.4:
+; X86-NEXT: vmovdqa %xmm1, %xmm2
+; X86-NEXT: .LBB0_5:
+; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm3
+; X86-NEXT: vblendvps %xmm3, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -370,29 +389,43 @@ define double @test_fmaximumnum_zero2(double %x, double %y) {
define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind {
; SSE2-LABEL: test_fmaximumnum_nsz:
; SSE2: # %bb.0:
+; SSE2-NEXT: ucomiss %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: jnp .LBB8_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: .LBB8_2:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: cmpordss %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm0, %xmm3
-; SSE2-NEXT: maxss %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm1
; SSE2-NEXT: andnps %xmm0, %xmm2
-; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: orps %xmm1, %xmm2
+; SSE2-NEXT: maxss %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_nsz:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vucomiss %xmm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, %xmm2
+; AVX1-NEXT: jnp .LBB8_2
+; AVX1-NEXT: # %bb.1:
+; AVX1-NEXT: vmovaps %xmm1, %xmm2
+; AVX1-NEXT: .LBB8_2:
+; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum_nsz:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vucomiss %xmm0, %xmm0
+; AVX512-NEXT: setnp %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovaps %xmm1, %xmm2
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_nsz:
@@ -404,9 +437,16 @@ define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1
-; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vucomiss %xmm1, %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm2
+; X86-NEXT: jnp .LBB8_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: vmovaps %xmm0, %xmm2
+; X86-NEXT: .LBB8_2:
+; X86-NEXT: vcmpordss %xmm1, %xmm1, %xmm3
+; X86-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -421,23 +461,26 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
; SSE2-NEXT: divss %xmm0, %xmm1
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: js .LBB9_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: .LBB9_2:
-; SSE2-NEXT: movaps %xmm3, %xmm2
-; SSE2-NEXT: cmpordss %xmm3, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB9_4
-; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: js .LBB9_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: jmp .LBB9_3
+; SSE2-NEXT: .LBB9_1:
+; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: .LBB9_4:
-; SSE2-NEXT: maxss %xmm1, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: .LBB9_3:
+; SSE2-NEXT: ucomiss %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: jnp .LBB9_5
+; SSE2-NEXT: # %bb.4:
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: .LBB9_5:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: cmpordss %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: maxss %xmm3, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_combine_cmps:
@@ -453,9 +496,15 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
; AVX1-NEXT: vmovaps %xmm1, %xmm2
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: .LBB9_3:
-; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vucomiss %xmm1, %xmm1
+; AVX1-NEXT: vmovaps %xmm1, %xmm0
+; AVX1-NEXT: jnp .LBB9_5
+; AVX1-NEXT: # %bb.4:
+; AVX1-NEXT: vmovaps %xmm2, %xmm0
+; AVX1-NEXT: .LBB9_5:
+; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm3
+; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fmaximumnum_combine_cmps:
@@ -465,12 +514,16 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
; AVX512F-NEXT: testl %eax, %eax
; AVX512F-NEXT: sets %al
; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovaps %xmm0, %xmm2
-; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512F-NEXT: vcmpordss %xmm1, %xmm1, %k1
+; AVX512F-NEXT: vmovaps %xmm1, %xmm2
+; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT: vucomiss %xmm2, %xmm2
+; AVX512F-NEXT: setnp %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps %xmm0, %xmm1
+; AVX512F-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT: vmaxss %xmm1, %xmm2, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fmaximumnum_combine_cmps:
@@ -493,22 +546,28 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
; X86-LABEL: test_fmaximumnum_combine_cmps:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovd %xmm1, %eax
+; X86-NEXT: vdivss %xmm2, %xmm0, %xmm0
+; X86-NEXT: vmovd %xmm2, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB9_1
; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovaps %xmm1, %xmm2
+; X86-NEXT: vmovaps %xmm2, %xmm1
; X86-NEXT: jmp .LBB9_3
; X86-NEXT: .LBB9_1:
-; X86-NEXT: vmovaps %xmm0, %xmm2
-; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: vmovaps %xmm0, %xmm1
+; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: .LBB9_3:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vucomiss %xmm0, %xmm0
+; X86-NEXT: vmovaps %xmm0, %xmm2
+; X86-NEXT: jnp .LBB9_5
+; X86-NEXT: # %bb.4:
+; X86-NEXT: vmovaps %xmm1, %xmm2
+; X86-NEXT: .LBB9_5:
+; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm3
+; X86-NEXT: vblendvps %xmm3, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -527,23 +586,27 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: js .LBB10_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: .LBB10_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: cmpordss %xmm3, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB10_4
-; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: js .LBB10_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: jmp .LBB10_3
+; SSE2-NEXT: .LBB10_1:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: .LBB10_4:
-; SSE2-NEXT: minss %xmm0, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: .LBB10_3:
+; SSE2-NEXT: ucomiss %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: jnp .LBB10_5
+; SSE2-NEXT: # %bb.4:
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: .LBB10_5:
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: cmpordss %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: minss %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum:
@@ -558,9 +621,15 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
; AVX1-NEXT: vmovdqa %xmm0, %xmm2
; AVX1-NEXT: vmovdqa %xmm1, %xmm0
; AVX1-NEXT: .LBB10_3:
-; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1
-; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vucomiss %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, %xmm1
+; AVX1-NEXT: jnp .LBB10_5
+; AVX1-NEXT: # %bb.4:
+; AVX1-NEXT: vmovdqa %xmm2, %xmm1
+; AVX1-NEXT: .LBB10_5:
+; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimumnum:
@@ -569,13 +638,16 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
; AVX512-NEXT: testl %eax, %eax
; AVX512-NEXT: sets %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovaps %xmm1, %xmm2
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovdqa %xmm0, %xmm2
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vucomiss %xmm2, %xmm2
+; AVX512-NEXT: setnp %al
+; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum:
@@ -586,21 +658,27 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
; X86-LABEL: test_fminimumnum:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB10_1
; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
+; X86-NEXT: vmovdqa %xmm2, %xmm1
; X86-NEXT: jmp .LBB10_3
; X86-NEXT: .LBB10_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
+; X86-NEXT: vmovdqa %xmm0, %xmm1
+; X86-NEXT: vmovdqa %xmm2, %xmm0
; X86-NEXT: .LBB10_3:
-; X86-NEXT: vminss %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vucomiss %xmm0, %xmm0
+; X86-NEXT: vmovdqa %xmm0, %xmm2
+; X86-NEXT: jnp .LBB10_5
+; X86-NEXT: # %bb.4:
+; X86-NEXT: vmovdqa %xmm1, %xmm2
+; X86-NEXT: .LBB10_5:
+; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm3
+; X86-NEXT: vblendvps %xmm3, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vminss %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -856,29 +934,43 @@ define double @test_fminimumnum_zero2(double %x, double %y) {
define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimumnum_nsz:
; SSE2: # %bb.0:
+; SSE2-NEXT: ucomiss %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: jnp .LBB18_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: .LBB18_2:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: cmpordss %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm0, %xmm3
-; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm1
; SSE2-NEXT: andnps %xmm0, %xmm2
-; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: orps %xmm1, %xmm2
+; SSE2-NEXT: minss %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_nsz:
; AVX1: # %bb.0:
-; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vucomiss %xmm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, %xmm2
+; AVX1-NEXT: jnp .LBB18_2
+; AVX1-NEXT: # %bb.1:
+; AVX1-NEXT: vmovaps %xmm1, %xmm2
+; AVX1-NEXT: .LBB18_2:
+; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vblendvps %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimumnum_nsz:
; AVX512: # %bb.0:
-; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vucomiss %xmm0, %xmm0
+; AVX512-NEXT: setnp %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovaps %xmm1, %xmm2
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_nsz:
@@ -890,9 +982,16 @@ define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1
-; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vucomiss %xmm1, %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm2
+; X86-NEXT: jnp .LBB18_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: vmovaps %xmm0, %xmm2
+; X86-NEXT: .LBB18_2:
+; X86-NEXT: vcmpordss %xmm1, %xmm1, %xmm3
+; X86-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vminss %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -907,23 +1006,27 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
; SSE2-NEXT: divss %xmm0, %xmm1
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: js .LBB19_2
-; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: js .LBB19_1
+; ...
[truncated]
|
|
Thanks @phoebewang, this PR can fix the failing tests in intel/llvm#20402 |
| return MinMax; | ||
|
|
||
| if (DAG.isKnownNeverNaN(NewX)) | ||
| NewX = NewY; | ||
|
|
||
| SDValue IsNaN = | ||
| DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO); | ||
| SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IsNum ? ISD::SETO : ISD::SETUO
->
ISD::SETUO
Why this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I optimized it with another approach. PTAL.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The prior code checked whether NewX has ordered elements, which is not correct for minimumnum/maximumnum. The new code checks whether MinMax has unordered elements and replaces them with NewX.
It is incorrect to just check for NewX and return its ordered elements.
f021f3e to
e28163f
Compare
Thanks for the check @wenju-he! Sorry, I just had an idea to optimize the implementation. Could you help check it again? Thanks! |
The newer version still works. |
Thanks a lot! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. This logic is a bit hard to follow; I hope that in the future it can be simplified with early returns, reducing the complexity from line 29496 onward for IsNum=true.
Simplified :) |
It is incorrect to just check for NewX and return its ordered elements.