diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index e891fdba4e03e..2932bbf0e7bbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -14,6 +14,10 @@ class CCIfNotInReg : CCIf<"!ArgFlags.isInReg()", A> {} class CCIfExtend : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>; +class CCIfOrigTypeShaderCCIsSGPR + : CCIf<[{(!OrigTy->getScalarType()->isFloatTy() && + !OrigTy->getScalarType()->isHalfTy()) }], A>; + // Calling convention for SI def CC_SI_Gfx : CallingConv<[ @@ -56,14 +60,15 @@ def CC_SI_SHADER : CallingConv<[ >>> ]>; + def RetCC_SI_Shader : CallingConv<[ CCIfType<[i1, i16], CCIfExtend>>, - CCIfType<[i32, i16, v2i16] , CCAssignToReg< + CCIfType<[i32, i16, v2i16], CCIfOrigTypeShaderCCIsSGPR("SGPR"#i)) // SGPR0-43 - >>, + >>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. - CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg< + CCIfType<[f32, f16, v2f16, bf16, v2bf16, i32, i16, v2i16] , CCAssignToReg< !foreach(i, !range(0, 136), !cast("VGPR"#i)) // VGPR0-135 >> ]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index c5e720ce26bc0..d61eb9d11937b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -481,7 +481,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, Custom); - setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction({ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, MVT::f64, + Expand); setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5756171147a61..0ba66af9bd41c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1121,6 +1121,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; } + if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16) + return MVT::i32; + if (VT.getSizeInBits() > 32) return MVT::i32; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index f48c72688533a..97fb83e0b6f45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -1664,6 +1664,7 @@ define amdgpu_ps <2 x half> @fma_v2s16_uniform(<2 x half> inreg %a, <2 x half> i ; GFX6-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: fma_v2s16_uniform: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll index 5344095e99217..ed44b1c0b294a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll @@ -15,22 +15,17 @@ define half @bitcast_i16_to_f16(i16 %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB0_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB0_4 -; SI-NEXT: .LBB0_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB0_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB0_2 -; SI-NEXT: .LBB0_4: ; %cmp.true +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i16_to_f16: @@ -125,6 +120,7 @@ define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) { ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB1_4: ; SI-NEXT: ; implicit-def: $vgpr0 @@ -199,8 +195,9 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) { ; SI-LABEL: bitcast_f16_to_i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] @@ -294,8 +291,9 @@ define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) { ; SI-LABEL: bitcast_f16_to_i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_cbranch_scc0 .LBB3_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB3_3 @@ -408,6 +406,8 @@ define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: ; %bb.2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i16_to_bf16: @@ -502,7 +502,8 @@ define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) { ; SI-NEXT: s_lshl_b32 s4, s6, 16 ; SI-NEXT: s_add_i32 s7, s4, 0x30000 ; SI-NEXT: .LBB5_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $sgpr7 @@ -577,6 +578,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; SI-LABEL: bitcast_bf16_to_i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 @@ -720,8 +722,9 @@ define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) { ; SI-LABEL: bitcast_bf16_to_i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 @@ -835,29 +838,27 @@ define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) { ; SI-LABEL: bitcast_f16_to_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB8_4 -; SI-NEXT: .LBB8_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB8_2 -; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: s_cbranch_execz .LBB8_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: .LBB8_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f16_to_bf16: @@ -942,21 +943,24 @@ define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) { ; SI-LABEL: bitcast_f16_to_bf16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB9_2 ; ; VI-LABEL: bitcast_f16_to_bf16_scalar: @@ -1049,30 +1053,26 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; SI-LABEL: bitcast_bf16_to_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_3 -; SI-NEXT: ; %bb.1: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execnz .LBB10_4 -; SI-NEXT: .LBB10_2: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; %bb.2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB10_2 -; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; %bb.4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_bf16_to_f16: @@ -1194,22 +1194,24 @@ define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) { ; SI-LABEL: bitcast_bf16_to_f16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s16, 16 ; SI-NEXT: s_cmp_lg_u32 s17, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s4 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB11_2 ; ; VI-LABEL: bitcast_bf16_to_f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index daa771a843ee6..0394ed7f89633 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2280,12 +2280,10 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2294,11 +2292,9 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2681,12 +2677,11 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GCN-LABEL: test_inreg_arg_store: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s39, 0xf000 ; GCN-NEXT: s_mov_b32 s38, 0 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GCN-NEXT: s_mov_b32 s39, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_mov_b32 s36, s38 ; GCN-NEXT: s_mov_b32 s37, s38 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2695,11 +2690,10 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s38, 0 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7-NEXT: s_mov_b32 s39, 0xf000 ; GFX7-NEXT: s_mov_b32 s36, s38 ; GFX7-NEXT: s_mov_b32 s37, s38 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2764,18 +2758,14 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GCN-LABEL: test_byval: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_byval: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2842,8 +2832,6 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GCN-LABEL: test_sret: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2851,8 +2839,6 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GFX7-LABEL: test_sret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3327,8 +3313,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v2, 1 @@ -3358,8 +3342,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_writelane_b32 v2, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v2, 1 @@ -5068,25 +5050,19 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GCN-LABEL: test_alloca_load_store_ret: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_alloca_load_store_ret: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_alloca_load_store_ret: @@ -5199,7 +5175,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 ; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) @@ -5234,7 +5209,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0 ; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen @@ -5260,8 +5234,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 @@ -9509,23 +9481,19 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fadd_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_bf16: @@ -13684,19 +13652,17 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GCN-LABEL: v_fadd_bf16_fpimm_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_bf16_fpimm_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_bf16_fpimm_0: @@ -13809,19 +13775,17 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GCN-LABEL: v_fadd_bf16_fpimm_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fadd_bf16_fpimm_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fadd_bf16_fpimm_1: @@ -13934,23 +13898,19 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fsub_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fsub_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fsub_bf16: @@ -14792,23 +14752,19 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fmul_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmul_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_bf16: @@ -18964,10 +18920,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fdiv_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GCN-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -18979,16 +18933,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -19000,7 +18952,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_bf16: @@ -19227,15 +19179,13 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GCN-LABEL: v_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fabs_bf16: @@ -19288,16 +19238,12 @@ define bfloat @v_fabs_bf16(bfloat %a) { define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fabs_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fabs_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fabs_bf16: @@ -19343,13 +19289,13 @@ define bfloat @v_fneg_bf16(bfloat %a) { ; GCN-LABEL: v_fneg_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fneg_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fneg_bf16: @@ -19405,16 +19351,14 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fneg_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_xor_b32 s0, s0, 0x8000 +; GCN-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fneg_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, -1.0, s0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fneg_bf16: @@ -19460,17 +19404,13 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GCN-LABEL: v_fneg_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fneg_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; GFX7-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fneg_fabs_bf16: @@ -19525,18 +19465,14 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fneg_fabs_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_bitset1_b32 s0, 15 +; GCN-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fneg_fabs_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_bitset1_b32 s0, 15 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fneg_fabs_bf16: @@ -19591,23 +19527,23 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_minnum_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_minnum_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minnum_bf16: @@ -24035,23 +23971,23 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_maxnum_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_maxnum_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maxnum_bf16: @@ -28472,10 +28408,9 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GCN-LABEL: v_sqrt_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0xf800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x260 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -28492,14 +28427,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sqrt_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0xf800000 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -28518,7 +28452,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX7-NEXT: v_mov_b32_e32 v2, 0x260 ; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sqrt_bf16: @@ -28737,10 +28671,9 @@ define bfloat @v_rsq_bf16(bfloat %x) { ; GCN-LABEL: v_rsq_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0xf800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x260 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -28769,14 +28702,13 @@ define bfloat @v_rsq_bf16(bfloat %x) { ; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_rsq_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0xf800000 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -28807,7 +28739,7 @@ define bfloat @v_rsq_bf16(bfloat %x) { ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rsq_bf16: @@ -29144,10 +29076,9 @@ define bfloat @v_neg_rsq_bf16(bfloat %x) { ; GCN-LABEL: v_neg_rsq_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0xf800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x260 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -29176,14 +29107,13 @@ define bfloat @v_neg_rsq_bf16(bfloat %x) { ; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_neg_rsq_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0xf800000 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -29214,7 +29144,7 @@ define bfloat @v_neg_rsq_bf16(bfloat %x) { ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_neg_rsq_bf16: @@ -29559,19 +29489,17 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GCN-LABEL: v_ldexp_bf16_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_ldexp_bf16_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ldexp_bf16_i32: @@ -29686,25 +29614,24 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GCN-LABEL: v_frexp_bf16_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0x7f800000 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_frexp_mant_f32_e32 v1, v0 ; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 ; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_frexp_bf16_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 -; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_frexp_bf16_i16: @@ -29830,11 +29757,10 @@ define bfloat @v_log_bf16(bfloat %a) { ; GCN-LABEL: v_log_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 @@ -29852,14 +29778,13 @@ define bfloat @v_log_bf16(bfloat %a) { ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_log_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc @@ -29877,7 +29802,7 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX7-NEXT: v_mov_b32_e32 v1, 0x41b17218 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_log_bf16: @@ -30116,24 +30041,22 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GCN-LABEL: v_log2_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 ; GCN-NEXT: v_log_f32_e32 v0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_log2_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc @@ -30142,7 +30065,7 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_log2_bf16: @@ -30289,11 +30212,10 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GCN-LABEL: v_log10_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0x800000 ; GCN-NEXT: s_mov_b32 s5, 0x7f800000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2 @@ -30311,14 +30233,13 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_log10_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc @@ -30336,7 +30257,7 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX7-NEXT: v_mov_b32_e32 v1, 0x411a209b ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_log10_bf16: @@ -30579,11 +30500,10 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GCN-LABEL: v_exp_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0xc2ce8ed0 ; GCN-NEXT: s_mov_b32 s5, 0x42b17218 ; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v0 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0 @@ -30601,14 +30521,13 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_exp_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x3fb8aa3b ; GFX7-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1 @@ -30627,7 +30546,7 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_exp_bf16: @@ -30876,25 +30795,23 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GCN-LABEL: v_exp2_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000 ; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GCN-NEXT: v_not_b32_e32 v2, 63 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_exp2_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0xc2fc0000 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x42800000 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 @@ -30904,7 +30821,7 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX7-NEXT: v_not_b32_e32 v1, 63 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_exp2_bf16: @@ -31053,11 +30970,10 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GCN-LABEL: v_exp10_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0xc23369f4 ; GCN-NEXT: s_mov_b32 s5, 0x421a209b ; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x40549000, v0 ; GCN-NEXT: v_sub_f32_e32 v3, v0, v0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0 @@ -31075,14 +30991,13 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_exp10_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x40549a78 ; GFX7-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 ; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1 @@ -31101,7 +31016,7 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_exp10_bf16: @@ -31352,19 +31267,17 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GCN-LABEL: v_ceil_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_ceil_f32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_ceil_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_ceil_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ceil_bf16: @@ -31479,19 +31392,17 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GCN-LABEL: v_trunc_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_trunc_f32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_trunc_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_trunc_bf16: @@ -31606,19 +31517,17 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GCN-LABEL: v_rint_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_rndne_f32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_rint_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rint_bf16: @@ -31733,19 +31642,17 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GCN-LABEL: v_nearbyint_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_rndne_f32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_nearbyint_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_nearbyint_bf16: @@ -31860,8 +31767,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GCN-LABEL: v_round_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v0 ; GCN-NEXT: v_sub_f32_e32 v2, v0, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 @@ -31869,14 +31775,13 @@ define bfloat @v_round_bf16(bfloat %a) { ; GCN-NEXT: s_brev_b32 s4, -2 ; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GCN-NEXT: v_add_f32_e32 v0, v1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_round_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_trunc_f32_e32 v1, v0 ; GFX7-NEXT: v_sub_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 @@ -31884,7 +31789,7 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX7-NEXT: s_brev_b32 s4, -2 ; GFX7-NEXT: v_bfi_b32 v0, s4, v2, v0 ; GFX7-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_round_bf16: @@ -32053,19 +31958,17 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GCN-LABEL: v_roundeven_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_rndne_f32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_roundeven_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_rndne_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_roundeven_bf16: @@ -32180,19 +32083,17 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GCN-LABEL: v_floor_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_floor_f32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_floor_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_floor_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_floor_bf16: @@ -32307,17 +32208,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GCN-LABEL: v_canonicalize_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_canonicalize_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_canonicalize_bf16: @@ -32485,10 +32386,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_oeq_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -32496,10 +32395,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_oeq_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -32593,10 +32490,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ogt_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -32604,10 +32499,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ogt_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -32701,10 +32594,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_oge_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -32712,10 +32603,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_oge_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -32809,10 +32698,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_olt_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -32820,10 +32707,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_olt_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -32917,10 +32802,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ole_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -32928,10 +32811,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ole_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33025,10 +32906,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_one_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33036,10 +32915,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_one_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33133,10 +33010,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_uno_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33144,10 +33019,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_uno_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33241,10 +33114,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ueq_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33252,10 +33123,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ueq_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33349,10 +33218,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ugt_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33360,10 +33227,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ugt_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33457,10 +33322,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_uge_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33468,10 +33331,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_uge_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33565,10 +33426,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ult_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33576,10 +33435,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ult_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33673,10 +33530,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_ule_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33684,10 +33539,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_ule_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33781,10 +33634,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GCN-LABEL: v_fcmp_une_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33792,10 +33643,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX7-LABEL: v_fcmp_une_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -33936,16 +33785,14 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) { ; GCN-LABEL: v_fptosi_bf16_to_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fptosi_bf16_to_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -34336,16 +34183,14 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) { ; GCN-LABEL: v_fptosi_bf16_to_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fptosi_bf16_to_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -34682,10 +34527,9 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GCN-LABEL: v_fptosi_bf16_to_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, 0x2f800000 ; GCN-NEXT: s_mov_b32 s5, 0xcf800000 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_trunc_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e64 v1, |v0|, s4 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 @@ -34702,8 +34546,7 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX7-LABEL: v_fptosi_bf16_to_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_trunc_f32_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s4, 0x2f800000 ; GFX7-NEXT: v_mul_f32_e64 v1, |v0|, s4 @@ -35968,7 +35811,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_i16_to_bf16: @@ -35976,7 +35819,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_i16_to_bf16: @@ -36699,14 +36542,14 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_i32_to_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_i32_to_bf16: @@ -37372,7 +37215,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_sitofp_i64_to_bf16: @@ -37390,7 +37233,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_sitofp_i64_to_bf16: @@ -39130,7 +38973,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_i16_to_bf16: @@ -39138,7 +38981,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_i16_to_bf16: @@ -39920,14 +39763,14 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_i32_to_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_i32_to_bf16: @@ -40589,7 +40432,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 ; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_i64_to_bf16: @@ -40603,7 +40446,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 ; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_i64_to_bf16: @@ -41969,23 +41812,17 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GCN-LABEL: v_select_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_bf16: @@ -42066,23 +41903,19 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GCN-LABEL: v_select_fneg_lhs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 +; GCN-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_fneg_lhs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_fneg_lhs_bf16: @@ -42172,23 +42005,19 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GCN-LABEL: v_select_fneg_rhs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; GCN-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_fneg_rhs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_fneg_rhs_bf16: @@ -42537,21 +42366,21 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GCN-LABEL: s_select_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_select_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; @@ -47346,27 +47175,21 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GCN-LABEL: v_fma_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_fma_f32 v0, v0, v1, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fma_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_bf16: @@ -52328,31 +52151,25 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GCN-LABEL: v_fmuladd_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fmuladd_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmuladd_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 3546141afe5bb..d8ef44361c40d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -3672,13 +3672,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3702,7 +3703,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -3712,13 +3712,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3743,7 +3744,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -4100,13 +4100,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4138,13 +4139,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -4830,11 +4832,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX7-NEXT: v_not_b32_e32 v10, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4845,28 +4848,28 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4878,33 +4881,33 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX6-NEXT: v_not_b32_e32 v10, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4915,28 +4918,29 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4948,22 +4952,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -5416,6 +5419,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5445,7 +5449,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5456,6 +5459,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5486,7 +5490,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 @@ -5926,6 +5929,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5964,6 +5968,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -6738,11 +6743,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_not_b32_e32 v10, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -6753,28 +6759,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -6786,33 +6792,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_not_b32_e32 v10, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -6823,28 +6829,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 ; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -6856,22 +6863,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB18_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 7896edd5016f0..fc3ed6d332211 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -2773,13 +2773,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2803,7 +2804,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2813,13 +2813,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2844,7 +2845,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -3222,13 +3222,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3260,13 +3261,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3974,11 +3976,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX7-NEXT: v_not_b32_e32 v10, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -3989,28 +3992,28 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_max_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4022,33 +4025,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX6-NEXT: v_not_b32_e32 v10, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4059,28 +4062,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_max_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v11 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4092,22 +4096,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -4560,6 +4563,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4590,7 +4594,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4601,6 +4604,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4632,7 +4636,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 @@ -5072,6 +5075,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5111,6 +5115,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5886,11 +5891,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_not_b32_e32 v10, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -5901,29 +5907,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 ; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -5935,33 +5941,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_not_b32_e32 v10, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -5972,29 +5978,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 ; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_max_f32_e32 v4, v4, v11 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -6006,22 +6013,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 2ade237eaa6da..8f270f9a466e2 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -2773,13 +2773,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2803,7 +2804,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: @@ -2813,13 +2813,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -2844,7 +2845,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -3222,13 +3222,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_and_b32 s4, s20, -4 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_and_b32 s4, s20, 3 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_not_b32 s7, s4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3260,13 +3261,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_and_b32 s4, s20, -4 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s4, s20, 3 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_not_b32 s7, s4 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -3974,11 +3976,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX7-NEXT: v_not_b32_e32 v10, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -3989,28 +3992,28 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_min_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -4022,33 +4025,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB12_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX6-NEXT: v_not_b32_e32 v10, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4059,28 +4062,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4 ; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -4092,22 +4096,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB12_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 @@ -4560,6 +4563,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4590,7 +4594,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4601,6 +4604,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_mov_b32_e32 v4, s4 ; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -4632,7 +4636,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 @@ -5072,6 +5075,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX7-NEXT: s_and_b32 s4, s20, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5111,6 +5115,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen ; GFX6-NEXT: s_and_b32 s4, s20, 3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -5886,11 +5891,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX7-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX7-NEXT: v_not_b32_e32 v9, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX7-NEXT: v_not_b32_e32 v10, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -5901,29 +5907,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 ; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX7-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -5935,33 +5941,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB15_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v6 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB15_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 -; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 +; GFX6-NEXT: v_and_b32_e32 v9, -4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 -; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 -; GFX6-NEXT: v_not_b32_e32 v9, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4 +; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX6-NEXT: v_not_b32_e32 v10, v4 ; GFX6-NEXT: s_mov_b64 s[6:7], exec ; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -5972,29 +5978,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen +; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 ; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_min_f32_e32 v4, v4, v11 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 -; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, v5, v10 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec -; GFX6-NEXT: v_mov_b32_e32 v5, v6 +; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0 @@ -6006,22 +6013,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB15_4 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX6-NEXT: s_mov_b64 exec, s[12:13] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v6 ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_cbranch_execnz .LBB15_3 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index f67e5b86497ba..d3881660bb846 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -1974,7 +1974,7 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 4.0 +; CI-NEXT: v_mov_b32_e32 v0, 0x4400 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 30dc25388767d..689f9d7d59550 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -62,9 +62,9 @@ entry: define amdgpu_ps half @ps_ret_cc_f16(half %arg0) { ; SI-LABEL: ps_ret_cc_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_ret_cc_f16: @@ -102,9 +102,9 @@ define amdgpu_ps half @ps_ret_cc_f16(half %arg0) { define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) { ; SI-LABEL: ps_ret_cc_inreg_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_ret_cc_inreg_f16: @@ -420,9 +420,9 @@ define amdgpu_kernel void @call_fastcc() #0 { define amdgpu_cs half @cs_mesa(half %arg0) { ; SI-LABEL: cs_mesa: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: cs_mesa: @@ -461,9 +461,9 @@ define amdgpu_cs half @cs_mesa(half %arg0) { define amdgpu_ps half @ps_mesa_f16(half %arg0) { ; SI-LABEL: ps_mesa_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_f16: @@ -502,9 +502,9 @@ define amdgpu_ps half @ps_mesa_f16(half %arg0) { define amdgpu_vs half @vs_mesa(half %arg0) { ; SI-LABEL: vs_mesa: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: vs_mesa: @@ -543,9 +543,9 @@ define amdgpu_vs half @vs_mesa(half %arg0) { define amdgpu_gs half @gs_mesa(half %arg0) { ; SI-LABEL: gs_mesa: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: gs_mesa: @@ -584,9 +584,9 @@ define amdgpu_gs half @gs_mesa(half %arg0) { define amdgpu_hs half @hs_mesa(half %arg0) { ; SI-LABEL: hs_mesa: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: hs_mesa: @@ -635,7 +635,6 @@ define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_v2f16: @@ -673,7 +672,6 @@ define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_inreg_v2f16: @@ -804,8 +802,6 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_v4f16: @@ -857,8 +853,6 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_readfirstlane_b32 s0, v0 -; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: ps_mesa_inreg_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 5eb6b2f58474d..711e2f2951fae 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -4352,8 +4352,8 @@ define half @v_clamp_f16_minimumnum_maximumnum(half %a) #1 { ; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_clamp_f16_minimumnum_maximumnum: @@ -4408,8 +4408,8 @@ define half @v_clamp_f16_minimumnum_maximumnum_no_ieee(half %a) #5 { ; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee: @@ -4464,11 +4464,10 @@ define half @v_clamp_f16_minimumnum_maximumnum_foldable_source(half %a, half %b) ; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum_foldable_source: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_clamp_f16_minimumnum_maximumnum_foldable_source: @@ -4524,11 +4523,10 @@ define half @v_clamp_f16_minimumnum_maximumnum_no_ieee_foldable_source(half %a, ; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee_foldable_source: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee_foldable_source: diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 40efd06c2bdfd..c48efc925ea8b 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1057,6 +1057,7 @@ define amdgpu_vs <2 x half> @load_v2i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_and_b32 s0, s0, 0xffff ; GFX67-NEXT: s_lshl_b32 s1, s3, 16 ; GFX67-NEXT: s_or_b32 s0, s0, s1 +; GFX67-NEXT: v_mov_b32_e32 v0, s0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v2i16: @@ -1114,6 +1115,8 @@ define amdgpu_vs <3 x half> @load_v3i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_lshl_b32 s2, s5, 16 ; GFX67-NEXT: s_or_b32 s0, s0, s2 ; GFX67-NEXT: s_and_b32 s1, s1, 0xffff +; GFX67-NEXT: v_mov_b32_e32 v0, s0 +; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v3i16: @@ -1180,6 +1183,8 @@ define amdgpu_vs <4 x half> @load_v4i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_or_b32 s0, s0, s2 ; GFX67-NEXT: s_lshl_b32 s2, s5, 16 ; GFX67-NEXT: s_or_b32 s1, s1, s2 +; GFX67-NEXT: v_mov_b32_e32 v0, s0 +; GFX67-NEXT: v_mov_b32_e32 v1, s1 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v4i16: @@ -1259,6 +1264,9 @@ define amdgpu_vs <6 x half> @load_v6i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_or_b32 s1, s1, s3 ; GFX67-NEXT: s_lshl_b32 s3, s8, 16 ; GFX67-NEXT: s_or_b32 s2, s2, s3 +; GFX67-NEXT: v_mov_b32_e32 v0, s0 +; GFX67-NEXT: v_mov_b32_e32 v1, s1 +; GFX67-NEXT: v_mov_b32_e32 v2, s2 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v6i16: @@ -1355,6 +1363,10 @@ define amdgpu_vs <8 x half> @load_v8i16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_or_b32 s2, s2, s4 ; GFX67-NEXT: s_lshl_b32 s4, s11, 16 ; GFX67-NEXT: s_or_b32 s3, s3, s4 +; GFX67-NEXT: v_mov_b32_e32 v0, s0 +; GFX67-NEXT: v_mov_b32_e32 v1, s1 +; GFX67-NEXT: v_mov_b32_e32 v2, s2 +; GFX67-NEXT: v_mov_b32_e32 v3, s3 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v8i16: @@ -1489,6 +1501,14 @@ define amdgpu_vs <16 x half> @load_v16i16(ptr addrspace(6) inreg %p0, ptr addrsp ; GFX67-NEXT: s_or_b32 s6, s6, s8 ; GFX67-NEXT: s_lshl_b32 s8, s23, 16 ; GFX67-NEXT: s_or_b32 s7, s7, s8 +; GFX67-NEXT: v_mov_b32_e32 v0, s0 +; GFX67-NEXT: v_mov_b32_e32 v1, s1 +; GFX67-NEXT: v_mov_b32_e32 v2, s2 +; GFX67-NEXT: v_mov_b32_e32 v3, s3 +; GFX67-NEXT: v_mov_b32_e32 v4, s4 +; GFX67-NEXT: v_mov_b32_e32 v5, s5 +; GFX67-NEXT: v_mov_b32_e32 v6, s6 +; GFX67-NEXT: v_mov_b32_e32 v7, s7 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v16i16: @@ -1810,7 +1830,6 @@ define amdgpu_vs <2 x half> @load_v2f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX67-NEXT: v_readfirstlane_b32 s0, v0 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v2f16: @@ -1868,14 +1887,12 @@ define amdgpu_vs <3 x half> @load_v3f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3 ; GFX67-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX67-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v1 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_add_f32_e32 v2, v4, v5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_readfirstlane_b32 s0, v0 -; GFX67-NEXT: v_readfirstlane_b32 s1, v2 +; GFX67-NEXT: v_add_f32_e32 v1, v4, v5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v3f16: @@ -1933,26 +1950,24 @@ define amdgpu_vs <4 x half> @load_v4f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0 ; GFX67-NEXT: s_lshr_b32 s0, s2, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s2 -; GFX67-NEXT: s_lshr_b32 s0, s3, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s0 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: s_lshr_b32 s1, s3, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s1 +; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s2 +; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s3 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX67-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX67-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX67-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_readfirstlane_b32 s0, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX67-NEXT: v_readfirstlane_b32 s1, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v4f16: @@ -2021,34 +2036,31 @@ define amdgpu_vs <6 x half> @load_v6f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0 ; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s4 ; GFX67-NEXT: s_lshr_b32 s0, s5, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s0 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v6 ; GFX67-NEXT: s_lshr_b32 s1, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 +; GFX67-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s1 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v11 +; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 ; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX67-NEXT: v_add_f32_e32 v2, v2, v10 ; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v10 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX67-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX67-NEXT: v_readfirstlane_b32 s0, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v8 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GFX67-NEXT: v_add_f32_e32 v3, v3, v11 ; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_readfirstlane_b32 s1, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX67-NEXT: v_readfirstlane_b32 s2, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX67-NEXT: v_add_f32_e32 v5, v5, v8 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX67-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v6f16: @@ -2128,47 +2140,43 @@ define amdgpu_vs <8 x half> @load_v8f16(ptr addrspace(6) inreg %p0, ptr addrspac ; GFX67-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-NEXT: s_lshr_b32 s0, s4, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s4 ; GFX67-NEXT: s_lshr_b32 s0, s5, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s4 ; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v8 ; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1 ; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v9 ; GFX67-NEXT: s_lshr_b32 s0, s6, 16 -; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v8 +; GFX67-NEXT: s_lshr_b32 s1, s7, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s0 ; GFX67-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_add_f32_e32 v0, v0, v9 +; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2 -; GFX67-NEXT: s_lshr_b32 s1, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s6 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v3 ; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s1 -; GFX67-NEXT: v_add_f32_e32 v4, v4, v15 +; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s6 +; GFX67-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s3 ; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s7 -; GFX67-NEXT: v_readfirstlane_b32 s0, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v14 -; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX67-NEXT: v_add_f32_e32 v4, v4, v15 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-NEXT: v_add_f32_e32 v6, v6, v13 -; GFX67-NEXT: v_readfirstlane_b32 s1, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GFX67-NEXT: v_add_f32_e32 v7, v7, v12 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v7 +; GFX67-NEXT: v_add_f32_e32 v5, v5, v14 ; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_readfirstlane_b32 s2, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX67-NEXT: v_readfirstlane_b32 s3, v0 +; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX67-NEXT: v_add_f32_e32 v7, v7, v12 +; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v7 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX67-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v8f16: @@ -2265,92 +2273,84 @@ define amdgpu_vs <16 x half> @load_v16f16(ptr addrspace(6) inreg %p0, ptr addrsp ; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s0 ; GFX67-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x10 ; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s8 +; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s10 ; GFX67-NEXT: s_lshr_b32 s8, s11, 16 ; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s8 -; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s11 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: s_lshr_b32 s8, s0, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s0 -; GFX67-NEXT: s_lshr_b32 s0, s7, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 ; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s7 -; GFX67-NEXT: s_lshr_b32 s0, s6, 16 -; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s10 -; GFX67-NEXT: v_add_f32_e32 v14, v14, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 -; GFX67-NEXT: v_add_f32_e32 v15, v15, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s6 -; GFX67-NEXT: s_lshr_b32 s0, s5, 16 -; GFX67-NEXT: v_add_f32_e32 v12, v12, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 +; GFX67-NEXT: s_lshr_b32 s7, s7, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s6 +; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s7 +; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s11 +; GFX67-NEXT: s_lshr_b32 s12, s5, 16 +; GFX67-NEXT: v_add_f32_e32 v13, v13, v19 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s12 ; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s9 -; GFX67-NEXT: v_add_f32_e32 v13, v13, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s5 -; GFX67-NEXT: s_lshr_b32 s0, s4, 16 +; GFX67-NEXT: s_lshr_b32 s13, s6, 16 +; GFX67-NEXT: v_add_f32_e32 v14, v14, v18 +; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s5 +; GFX67-NEXT: v_add_f32_e32 v15, v15, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s13 +; GFX67-NEXT: s_lshr_b32 s11, s4, 16 ; GFX67-NEXT: v_add_f32_e32 v10, v10, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 -; GFX67-NEXT: v_add_f32_e32 v11, v11, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s3 +; GFX67-NEXT: v_add_f32_e32 v11, v11, v18 +; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s11 +; GFX67-NEXT: v_add_f32_e32 v12, v12, v17 ; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s4 -; GFX67-NEXT: s_lshr_b32 s0, s3, 16 -; GFX67-NEXT: v_add_f32_e32 v8, v8, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 +; GFX67-NEXT: s_lshr_b32 s9, s2, 16 +; GFX67-NEXT: v_add_f32_e32 v7, v7, v19 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s9 +; GFX67-NEXT: s_lshr_b32 s10, s3, 16 +; GFX67-NEXT: v_add_f32_e32 v8, v8, v18 +; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s2 ; GFX67-NEXT: v_add_f32_e32 v9, v9, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s3 -; GFX67-NEXT: s_lshr_b32 s0, s2, 16 -; GFX67-NEXT: v_add_f32_e32 v6, v6, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 -; GFX67-NEXT: v_cvt_f32_f16_e32 v16, s8 -; GFX67-NEXT: v_add_f32_e32 v7, v7, v17 -; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s2 +; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s10 +; GFX67-NEXT: s_lshr_b32 s8, s0, 16 +; GFX67-NEXT: v_cvt_f32_f16_e32 v16, s0 ; GFX67-NEXT: s_lshr_b32 s0, s1, 16 ; GFX67-NEXT: v_add_f32_e32 v4, v4, v19 -; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0 -; GFX67-NEXT: v_add_f32_e32 v1, v1, v16 -; GFX67-NEXT: v_add_f32_e32 v5, v5, v17 +; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s8 +; GFX67-NEXT: v_add_f32_e32 v5, v5, v18 +; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s0 +; GFX67-NEXT: v_add_f32_e32 v6, v6, v17 ; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s1 +; GFX67-NEXT: v_add_f32_e32 v1, v1, v19 +; GFX67-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX67-NEXT: v_add_f32_e32 v0, v0, v16 ; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX67-NEXT: v_add_f32_e32 v0, v0, v18 +; GFX67-NEXT: v_add_f32_e32 v3, v3, v17 ; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX67-NEXT: v_add_f32_e32 v2, v2, v19 ; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX67-NEXT: v_add_f32_e32 v3, v3, v17 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GFX67-NEXT: v_readfirstlane_b32 s0, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GFX67-NEXT: v_readfirstlane_b32 s1, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v7 -; GFX67-NEXT: v_readfirstlane_b32 s2, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v8 -; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GFX67-NEXT: v_readfirstlane_b32 s3, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v10 -; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v11 -; GFX67-NEXT: v_readfirstlane_b32 s4, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v12 -; GFX67-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX67-NEXT: v_readfirstlane_b32 s5, v0 -; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v13 ; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v14 -; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v15 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_readfirstlane_b32 s6, v0 -; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX67-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX67-NEXT: v_readfirstlane_b32 s7, v0 +; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GFX67-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v7 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GFX67-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v9 +; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v10 +; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v11 +; GFX67-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX67-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX67-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v12 +; GFX67-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v13 +; GFX67-NEXT: v_cvt_f16_f32_e32 v8, v14 +; GFX67-NEXT: v_cvt_f16_f32_e32 v9, v15 +; GFX67-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX67-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX67-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX67-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX67-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index f3cb0e140b535..9ad10481c814a 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -474,20 +474,12 @@ define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind { } define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind { -; SI-LABEL: v_uitofp_i32_to_f16_mask255: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_uitofp_i32_to_f16_mask255: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_uitofp_i32_to_f16_mask255: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i32_to_f16_mask255: ; GFX10: ; %bb.0: @@ -524,20 +516,12 @@ define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind { } define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind { -; SI-LABEL: v_sitofp_i32_to_f16_mask255: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_sitofp_i32_to_f16_mask255: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_sitofp_i32_to_f16_mask255: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i32_to_f16_mask255: ; GFX10: ; %bb.0: @@ -574,20 +558,12 @@ define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind { } define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind { -; SI-LABEL: v_uitofp_to_f16_lshr8_mask255: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_uitofp_to_f16_lshr8_mask255: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_uitofp_to_f16_lshr8_mask255: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255: ; GFX10: ; %bb.0: @@ -625,20 +601,12 @@ define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind { } define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind { -; SI-LABEL: v_uitofp_to_f16_lshr16_mask255: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_uitofp_to_f16_lshr16_mask255: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_uitofp_to_f16_lshr16_mask255: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255: ; GFX10: ; %bb.0: @@ -676,20 +644,12 @@ define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind { } define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind { -; SI-LABEL: v_uitofp_to_f16_lshr24_mask255: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_uitofp_to_f16_lshr24_mask255: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255: ; GFX10: ; %bb.0: @@ -732,7 +692,6 @@ define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_uitofp_i8_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 777b703d5319d..21abcbd4f5edc 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1539,25 +1539,15 @@ define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bo define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_f16_test1: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_f16_test1: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_f16_test1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmul_select_f16_test1: ; GFX9-SDAG: ; %bb.0: @@ -1642,25 +1632,15 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_f16_test2: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_f16_test2: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_f16_test2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmul_select_f16_test2: ; GFX9-SDAG: ; %bb.0: @@ -2078,25 +2058,15 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, } define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_f16_test5: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_f16_test5: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_f16_test5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_f16_test5: ; GFX9: ; %bb.0: @@ -2159,13 +2129,13 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test6: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmul_select_f16_test6: @@ -2268,12 +2238,12 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test7: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmul_select_f16_test7: @@ -2376,12 +2346,12 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test8: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmul_select_f16_test8: @@ -2458,11 +2428,12 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-SDAG-LABEL: fmul_select_f16_test9: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmul_select_f16_test9: @@ -2565,25 +2536,15 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) { } define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11: ; GFX9-SDAG: ; %bb.0: @@ -2668,25 +2629,15 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a } define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14: ; GFX9-SDAG: ; %bb.0: @@ -2774,12 +2725,11 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_bf16_test1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test1: @@ -2908,12 +2858,11 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_bf16_test2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test2: @@ -3448,13 +3397,12 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_bf16_test5: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, 2.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test5: @@ -3583,14 +3531,13 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_bf16_test6: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1000000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test6: @@ -3719,13 +3666,12 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_bf16_test7: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test7: @@ -3854,13 +3800,12 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_bf16_test8: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test8: @@ -3984,14 +3929,13 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX7-LABEL: fmul_select_bf16_test9: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0xc2000000 ; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1800000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test9: @@ -4120,14 +4064,13 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX7-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0xdb800000 ; GFX7-NEXT: v_bfrev_b32_e32 v4, 7 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: @@ -4256,14 +4199,13 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX7-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_bfrev_b32_e32 v3, 50 ; GFX7-NEXT: v_mov_b32_e32 v4, 0x34800000 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 8532a7f716ba7..d9b23d43d593d 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -246,7 +246,7 @@ define float @divergent_vec_f16_0(half %a) { ; GCN-LABEL: divergent_vec_f16_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: divergent_vec_f16_0: @@ -654,8 +654,7 @@ define float @divergent_vec_f16_LL(half %a, half %b) { ; GCN-LABEL: divergent_vec_f16_LL: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 65e2b26a79fbd..a723a67498d05 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -215,8 +215,6 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 ; CI-LABEL: v_test_canonicalize_build_vector_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -2572,11 +2570,12 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { ; CI-LABEL: v_test_canonicalize_reg_undef_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2615,9 +2614,8 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI-LABEL: v_test_canonicalize_undef_reg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -2775,11 +2773,10 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 { ; CI-LABEL: v_test_canonicalize_reg_k_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2821,9 +2818,8 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { ; CI-LABEL: v_test_canonicalize_k_reg_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -2915,12 +2911,11 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_or_b32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2967,11 +2962,9 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v2, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, 0x7fc00000 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 @@ -3027,17 +3020,14 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v3, 0x7fc00000 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v0, v0, v3 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 50066711f2552..a8703d5d6e51d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -13,23 +13,15 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { ; GCN-LABEL: v_copysign_bf16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_movk_i32 s4, 0x7fff +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_movk_i32 s4, 0x7fff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_bf16_bf16: @@ -65,23 +57,17 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GCN-LABEL: v_copysign_bf16_s_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_and_b32 s4, s16, 0x80000000 -; GCN-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_movk_i32 s4, 0x7fff +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_bf16_s_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: s_and_b32 s4, s16, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s4, s4, 16 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_movk_i32 s4, 0x7fff +; GFX7-NEXT: v_mov_b32_e32 v1, s16 +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_bf16_s_bf16: @@ -127,23 +113,17 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GCN-LABEL: v_copysign_s_bf16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_movk_i32 s4, 0x7fff +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_s_bf16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_movk_i32 s4, 0x7fff +; GFX7-NEXT: v_mov_b32_e32 v1, s16 +; GFX7-NEXT: v_bfi_b32 v0, s4, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_s_bf16_bf16: @@ -189,23 +169,19 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { ; GCN-LABEL: v_copysign_bf16_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_bf16_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_bf16_f32: @@ -247,23 +223,19 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { ; GCN-LABEL: v_copysign_bf16_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_bf16_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_bf16_f64: @@ -305,23 +277,15 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) { ; GCN-LABEL: v_copysign_bf16_f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_movk_i32 s4, 0x7fff +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_bf16_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_movk_i32 s4, 0x7fff +; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_bf16_f16: @@ -357,22 +321,16 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) { define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) { ; GCN-LABEL: s_copysign_bf16_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s1, s1, 0x8000 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s1, s1, 0x8000 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_bf16: @@ -433,22 +391,18 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) { ; GCN-LABEL: s_copysign_bf16_f32: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff +; GCN-NEXT: s_and_b32 s1, s1, 0x80000000 +; GCN-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s1, s1, 0x80000000 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_f32: @@ -510,22 +464,18 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) { ; GCN-LABEL: s_copysign_bf16_f64: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: s_and_b32 s0, s2, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff +; GCN-NEXT: s_and_b32 s1, s2, 0x80000000 +; GCN-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s1, s2, 0x80000000 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_f64: @@ -587,22 +537,16 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) { ; GCN-LABEL: s_copysign_bf16_f16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, s1 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s1, s1, 0x8000 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_f16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s1 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s1, s1, 0x8000 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_f16: @@ -667,6 +611,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { ; GCN-LABEL: v_copysign_f32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: s_brev_b32 s4, -2 ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -674,6 +619,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { ; GFX7-LABEL: v_copysign_f32_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: s_brev_b32 s4, -2 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -724,6 +670,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) { ; GCN-LABEL: s_copysign_f32_bf16: ; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_brev_b32 s2, -2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -733,6 +680,7 @@ define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.b ; ; GFX7-LABEL: s_copysign_f32_bf16: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_brev_b32 s2, -2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -793,25 +741,21 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GCN-LABEL: v_copysign_f16_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GCN-NEXT: s_brev_b32 s4, -2 ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_f16_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_brev_b32 s4, -2 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_f16_bf16: @@ -847,24 +791,18 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) { ; GCN-LABEL: s_copysign_f16_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, s0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, s1 ; GCN-NEXT: s_brev_b32 s0, -2 -; GCN-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GCN-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_f16_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s0 -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s1 ; GFX7-NEXT: s_brev_b32 s0, -2 ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -933,6 +871,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { ; GCN-LABEL: v_copysign_f64_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: s_brev_b32 s4, -2 ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -940,6 +879,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { ; GFX7-LABEL: v_copysign_f64_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_brev_b32 s4, -2 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -990,6 +930,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) { ; GCN-LABEL: s_copysign_f64_bf16: ; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_brev_b32 s3, -2 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -999,6 +940,7 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg ; ; GFX7-LABEL: s_copysign_f64_bf16: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_brev_b32 s3, -2 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 @@ -2716,15 +2658,17 @@ define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign define amdgpu_ps i32 @s_copysign_out_f32_mag_bf16_sign_f32(bfloat inreg %mag, float inreg %sign) { ; GCN-LABEL: s_copysign_out_f32_mag_bf16_sign_f32: ; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_brev_b32 s2, -2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_bfi_b32 v0, s2, v1, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_out_f32_mag_bf16_sign_f32: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_brev_b32 s2, -2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2778,16 +2722,18 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_bf16_sign_f32(bfloat inreg %mag, fl define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_bf16_sign_f64(bfloat inreg %mag, double inreg %sign) { ; GCN-LABEL: s_copysign_out_f64_mag_bf16_sign_f64: ; GCN: ; %bb.0: -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 -; GCN-NEXT: s_brev_b32 s0, -2 +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: s_brev_b32 s1, -2 ; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 +; GCN-NEXT: v_bfi_b32 v1, s1, v1, v2 ; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_out_f64_mag_bf16_sign_f64: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 ; GFX7-NEXT: s_brev_b32 s0, -2 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 @@ -2846,6 +2792,7 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_bf16_sign_f64(bfloat inreg %m define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfloat inreg %sign) { ; GCN-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: ; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_brev_b32 s2, -2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2855,6 +2802,7 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfl ; ; GFX7-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_brev_b32 s2, -2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2911,6 +2859,7 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfl define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %mag, bfloat inreg %sign) { ; GCN-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: ; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_brev_b32 s3, -2 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -2920,6 +2869,7 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %m ; ; GFX7-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_brev_b32 s3, -2 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 @@ -2976,22 +2926,18 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %m define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, float inreg %sign) { ; GCN-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff +; GCN-NEXT: s_and_b32 s1, s1, 0x80000000 +; GCN-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s1, s1, 0x80000000 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: @@ -3044,22 +2990,18 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, f define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f64(bfloat inreg %mag, double inreg %sign) { ; GCN-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: s_and_b32 s0, s2, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_or_b32_e32 v0, s0, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s0, s0, 0x7fff +; GCN-NEXT: s_and_b32 s1, s2, 0x80000000 +; GCN-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s1, s2, 0x80000000 +; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: @@ -3113,8 +3055,7 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_f32_sign_bf16(float inreg %mag, bf ; GCN-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: s_and_b32 s0, s1, 0xffff8000 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 @@ -3123,8 +3064,7 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_f32_sign_bf16(float inreg %mag, bf ; GFX7-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: s_and_b32 s0, s1, 0xffff8000 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -7282,18 +7222,12 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64(<4 x bfloat> %m define amdgpu_ps i32 @s_copysign_bf16_0_bf16(bfloat inreg %sign) { ; GCN-LABEL: s_copysign_bf16_0_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_and_b32 s0, s0, 0x8000 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_bf16_0_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: s_and_b32 s0, s0, 0x8000 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_bf16_0_bf16: @@ -7325,13 +7259,13 @@ define bfloat @v_copysign_bf16_0_bf16(bfloat %sign) { ; GCN-LABEL: v_copysign_bf16_0_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_bf16_0_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_bf16_0_bf16: @@ -7444,13 +7378,17 @@ define bfloat @v_copysign_bf16_0_f32(float %sign) { ; GCN-LABEL: v_copysign_bf16_0_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_bf16_0_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_bf16_0_f32: @@ -7624,13 +7562,17 @@ define bfloat @v_copysign_bf16_0_f64(double %sign) { ; GCN-LABEL: v_copysign_bf16_0_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v1 +; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_bf16_0_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v1 +; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_bf16_0_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 7f38e5bb5bb61..b80204e70851e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -16,11 +16,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) { ; SI-LABEL: s_copysign_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog @@ -67,9 +66,7 @@ define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) { define amdgpu_ps i16 @s_test_copysign_f16_0(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_0: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_0: @@ -94,9 +91,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_0(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_1(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_1: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_1: @@ -121,9 +116,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_1(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_10.0(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_10.0: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_10.0: @@ -148,9 +141,8 @@ define amdgpu_ps i16 @s_test_copysign_f16_10.0(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_neg1: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_bitset1_b32 s0, 15 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_neg1: @@ -175,9 +167,8 @@ define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_neg10(half inreg %mag) { ; SI-LABEL: s_test_copysign_f16_neg10: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x7fff +; SI-NEXT: s_bitset1_b32 s0, 15 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_neg10: @@ -202,9 +193,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_neg10(half inreg %mag) { define amdgpu_ps i16 @s_test_copysign_f16_0_mag(half inreg %sign) { ; SI-LABEL: s_test_copysign_f16_0_mag: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_0_mag: @@ -230,10 +219,8 @@ define amdgpu_ps i16 @s_test_copysign_f16_0_mag(half inreg %sign) { define amdgpu_ps i16 @s_test_copysign_f16_1_mag(half inreg %sign) { ; SI-LABEL: s_test_copysign_f16_1_mag: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0xffff8000 +; SI-NEXT: s_or_b32 s0, s0, 0x3c00 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_1_mag: @@ -263,9 +250,9 @@ define i16 @s_test_copysign_f16_10_mag(half inreg %sign) { ; SI-LABEL: s_test_copysign_f16_10_mag: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x4900, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff8000 +; SI-NEXT: s_or_b32 s4, s4, 0x4900 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: s_test_copysign_f16_10_mag: @@ -300,10 +287,8 @@ define i16 @s_test_copysign_f16_10_mag(half inreg %sign) { define amdgpu_ps i16 @s_test_copysign_f16_neg1_mag(half inreg %sign) { ; SI-LABEL: s_test_copysign_f16_neg1_mag: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0xffff8000 +; SI-NEXT: s_or_b32 s0, s0, 0x3c00 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_neg1_mag: @@ -332,10 +317,8 @@ define amdgpu_ps i16 @s_test_copysign_f16_neg1_mag(half inreg %sign) { define amdgpu_ps i16 @s_test_copysign_f16_neg10_mag(half inreg %sign) { ; SI-LABEL: s_test_copysign_f16_neg10_mag: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_or_b32_e32 v0, 0x4900, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0xffff8000 +; SI-NEXT: s_or_b32 s0, s0, 0x4900 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_test_copysign_f16_neg10_mag: @@ -365,10 +348,11 @@ define half @v_copysign_f16(half %mag, half %sign) { ; SI-LABEL: v_copysign_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_f16: @@ -398,8 +382,7 @@ define half @v_test_copysign_f16_0(half %mag) { ; SI-LABEL: v_test_copysign_f16_0: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_0: @@ -433,8 +416,7 @@ define half @v_test_copysign_f16_1(half %mag) { ; SI-LABEL: v_test_copysign_f16_1: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_1: @@ -468,8 +450,7 @@ define half @v_test_copysign_f16_10(half %mag) { ; SI-LABEL: v_test_copysign_f16_10: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_10: @@ -503,8 +484,8 @@ define half @v_test_copysign_f16_neg1(half %mag) { ; SI-LABEL: v_test_copysign_f16_neg1: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_neg1: @@ -538,8 +519,8 @@ define half @v_test_copysign_f16_neg10(half %mag) { ; SI-LABEL: v_test_copysign_f16_neg10: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f16_neg10: @@ -573,6 +554,7 @@ define float @v_copysign_out_f32_mag_f16_sign_f32(half %mag, float %sign) { ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -617,8 +599,9 @@ define double @v_copysign_out_f64_mag_f16_sign_f64(half %mag, double %sign) { ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -666,6 +649,7 @@ define float @v_copysign_out_f32_mag_f32_sign_f16(float %mag, half %sign) { ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -710,6 +694,7 @@ define double @v_copysign_out_f64_mag_f64_sign_f16(double %mag, half %sign) { ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -754,10 +739,10 @@ define half @v_copysign_out_f16_mag_f16_sign_f32(half %mag, float %sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: @@ -792,10 +777,10 @@ define half @v_copysign_out_f16_mag_f16_sign_f64(half %mag, double %sign) { ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: @@ -831,9 +816,11 @@ define half @v_copysign_out_f16_mag_f32_sign_f16(float %mag, half %sign) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: @@ -883,8 +870,9 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3 ; SI-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-NEXT: s_movk_i32 s4, 0x3f1 ; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: v_sub_i32_e32 v5, vcc, 0x3f1, v4 +; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 ; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0 ; SI-NEXT: v_med3_i32 v5, v5, 0, 13 ; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v3 @@ -918,9 +906,11 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_f16_mag_f64_sign_f16: @@ -1115,51 +1105,51 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, half inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_and_b32 s3, s1, 0x1ff -; SI-NEXT: s_or_b32 s0, s3, s0 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_and_b32 s2, s1, 0x1ff +; SI-NEXT: s_or_b32 s0, s2, s0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; SI-NEXT: s_lshr_b32 s0, s1, 8 -; SI-NEXT: s_bfe_u32 s4, s1, 0xb0014 +; SI-NEXT: s_bfe_u32 s3, s1, 0xb0014 ; SI-NEXT: s_and_b32 s0, s0, 0xffe -; SI-NEXT: v_readfirstlane_b32 s3, v0 -; SI-NEXT: s_sub_i32 s5, 0x3f1, s4 -; SI-NEXT: s_or_b32 s0, s0, s3 -; SI-NEXT: v_med3_i32 v0, s5, 0, 13 -; SI-NEXT: s_or_b32 s3, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s5, v0 -; SI-NEXT: s_lshr_b32 s6, s3, s5 -; SI-NEXT: s_lshl_b32 s5, s6, s5 -; SI-NEXT: s_cmp_lg_u32 s5, s3 -; SI-NEXT: s_cselect_b32 s3, 1, 0 -; SI-NEXT: s_addk_i32 s4, 0xfc10 -; SI-NEXT: s_lshl_b32 s5, s4, 12 -; SI-NEXT: s_or_b32 s3, s6, s3 -; SI-NEXT: s_or_b32 s5, s0, s5 -; SI-NEXT: s_cmp_lt_i32 s4, 1 -; SI-NEXT: s_cselect_b32 s3, s3, s5 -; SI-NEXT: s_and_b32 s5, s3, 7 -; SI-NEXT: s_cmp_gt_i32 s5, 5 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: v_readfirstlane_b32 s2, v1 +; SI-NEXT: s_sub_i32 s4, 0x3f1, s3 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: v_med3_i32 v1, s4, 0, 13 +; SI-NEXT: s_or_b32 s2, s0, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_lshr_b32 s5, s2, s4 +; SI-NEXT: s_lshl_b32 s4, s5, s4 +; SI-NEXT: s_cmp_lg_u32 s4, s2 +; SI-NEXT: s_cselect_b32 s2, 1, 0 +; SI-NEXT: s_addk_i32 s3, 0xfc10 +; SI-NEXT: s_lshl_b32 s4, s3, 12 +; SI-NEXT: s_or_b32 s2, s5, s2 +; SI-NEXT: s_or_b32 s4, s0, s4 +; SI-NEXT: s_cmp_lt_i32 s3, 1 +; SI-NEXT: s_cselect_b32 s2, s2, s4 +; SI-NEXT: s_and_b32 s4, s2, 7 +; SI-NEXT: s_cmp_gt_i32 s4, 5 ; SI-NEXT: s_cselect_b32 s5, 1, 0 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_lshr_b32 s3, s3, 2 -; SI-NEXT: s_add_i32 s3, s3, s5 -; SI-NEXT: s_cmp_lt_i32 s4, 31 -; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; SI-NEXT: s_cmp_eq_u32 s4, 3 +; SI-NEXT: s_cselect_b32 s4, 1, 0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshr_b32 s2, s2, 2 +; SI-NEXT: s_add_i32 s2, s2, s4 +; SI-NEXT: s_cmp_lt_i32 s3, 31 +; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00 ; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: s_movk_i32 s0, 0x7e00 ; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s4, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s3 +; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f +; SI-NEXT: s_cselect_b32 s0, s0, s2 ; SI-NEXT: s_lshr_b32 s1, s1, 16 ; SI-NEXT: s_and_b32 s1, s1, 0x8000 ; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog @@ -2551,10 +2541,10 @@ define <32 x half> @v_copysign_v32f32(<32 x half> %mag, <32 x half> %sign) { define amdgpu_ps i32 @s_copysign_out_f32_mag_f16_sign_f32(half inreg %mag, float inreg %sign) { ; SI-LABEL: s_copysign_out_f32_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -2592,12 +2582,13 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f16_sign_f32(half inreg %mag, float define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f16_sign_f64(half inreg %mag, double inreg %sign) { ; SI-LABEL: s_copysign_out_f64_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 -; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: v_readfirstlane_b32 s1, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_out_f64_mag_f16_sign_f64: @@ -2641,10 +2632,10 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f16_sign_f64(half inreg %mag, define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_f16(float inreg %mag, half inreg %sign) { ; SI-LABEL: s_copysign_out_f32_mag_f32_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: s_brev_b32 s1, -2 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_bfi_b32 v0, s1, v1, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -2690,10 +2681,10 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_f16(float inreg %mag, half define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %mag, half inreg %sign) { ; SI-LABEL: s_copysign_out_f64_mag_f64_sign_f16: ; SI: ; %bb.0: -; SI-NEXT: s_brev_b32 s3, -2 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_bfi_b32 v0, s3, v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 ; SI-NEXT: v_readfirstlane_b32 s1, v0 ; SI-NEXT: ; return to shader part epilog ; @@ -2739,10 +2730,9 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %ma define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f32: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 @@ -2791,10 +2781,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f64(half inreg %mag, double inreg %sign) { ; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f64: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 @@ -2844,8 +2833,8 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f32_sign_f16(float inreg %mag, half ; SI-LABEL: s_copysign_out_f16_mag_f32_sign_f16: ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -6900,9 +6889,7 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f64(<4 x half> %mag, <4 define amdgpu_ps i32 @s_copysign_f16_0_f16(half inreg %sign) { ; SI-LABEL: s_copysign_f16_0_f16: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_and_b32 s0, s0, 0x8000 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_f16_0_f16: @@ -6929,7 +6916,7 @@ define half @v_copysign_f16_0_f16(half %sign) { ; SI-LABEL: v_copysign_f16_0_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_f16_0_f16: @@ -7007,7 +6994,8 @@ define half @v_copysign_f16_0_f32(float %sign) { ; SI-LABEL: v_copysign_f16_0_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_f16_0_f32: @@ -7242,7 +7230,45 @@ define half @v_copysign_f16_0_f64(double %sign) { ; SI-LABEL: v_copysign_f16_0_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2 +; SI-NEXT: v_bfe_u32 v3, v1, 20, 11 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v3 +; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; SI-NEXT: v_med3_i32 v4, v4, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2 +; SI-NEXT: s_movk_i32 s4, 0xfc10 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_and_b32_e32 v2, 7, v0 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v2 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; SI-NEXT: s_movk_i32 s4, 0x40f +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3 +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], s4, v3 +; SI-NEXT: v_mov_b32_e32 v2, 0x7c00 +; SI-NEXT: s_and_b64 vcc, s[4:5], vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_f16_0_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index eea2e46f8e390..7b11922e75b4f 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -1177,17 +1177,12 @@ define float @v_copysign_f32_0_f64(double %sign) { } define amdgpu_ps i32 @s_copysign_f32_0_f16(half inreg %sign) { -; SI-LABEL: s_copysign_f32_0_f16: -; SI: ; %bb.0: -; SI-NEXT: s_and_b32 s0, s0, 0x80000000 -; SI-NEXT: ; return to shader part epilog -; -; VI-LABEL: s_copysign_f32_0_f16: -; VI: ; %bb.0: -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; VI-NEXT: v_readfirstlane_b32 s0, v0 -; VI-NEXT: ; return to shader part epilog +; SIVI-LABEL: s_copysign_f32_0_f16: +; SIVI: ; %bb.0: +; SIVI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SIVI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; SIVI-NEXT: v_readfirstlane_b32 s0, v0 +; SIVI-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_f32_0_f16: ; GFX11: ; %bb.0: @@ -1206,6 +1201,7 @@ define float @v_copysign_f32_0_f16(half %sign) { ; SI-LABEL: v_copysign_f32_0_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index b72eb5c5cf588..6c37316d33d03 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -955,9 +955,10 @@ define double @v_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double ; SI-LABEL: v_test_copysign_f64_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: v_mov_b32_e32 v0, v10 -; SI-NEXT: v_bfi_b32 v1, s4, v11, v20 +; SI-NEXT: v_bfi_b32 v1, s4, v11, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_test_copysign_f64_f16: @@ -1139,6 +1140,7 @@ define double @v_copysign_f64_0_f32(float %sign) { define amdgpu_ps <2 x i32> @s_copysign_f64_0_f16(half inreg %sign) { ; SI-LABEL: s_copysign_f64_0_f16: ; SI: ; %bb.0: +; SI-NEXT: s_lshl_b32 s0, s0, 16 ; SI-NEXT: s_and_b32 s1, s0, 0x80000000 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: ; return to shader part epilog @@ -1164,22 +1166,14 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_0_f16(half inreg %sign) { } define double @v_copysign_f64_0_f16(half %sign) { -; SI-LABEL: v_copysign_f64_0_f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_copysign_f64_0_f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 -; VI-NEXT: s_setpc_b64 s[30:31] +; SIVI-LABEL: v_copysign_f64_0_f16: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SIVI-NEXT: v_mov_b32_e32 v0, 0 +; SIVI-NEXT: v_and_b32_e32 v1, 0x80000000, v1 +; SIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_copysign_f64_0_f16: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 0bb3b8c6f3740..1779c45203f47 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -1937,9 +1937,6 @@ define half @v_fdiv_f16_arcp(half %x, half %y) { ; SI-LABEL: v_fdiv_f16_arcp: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -1955,6 +1952,8 @@ define half @v_fdiv_f16_arcp(half %x, half %y) { ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f16_arcp: @@ -2001,12 +2000,11 @@ define half @v_fdiv_f16_afn_nsz(half %x, half %y) { ; SI-LABEL: v_fdiv_f16_afn_nsz: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_rcp_f32_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f16_afn_nsz: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index e32842f8d6f57..dcf0519dee355 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -8690,10 +8690,11 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -8718,7 +8719,6 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -9071,10 +9071,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -9099,7 +9100,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9454,10 +9454,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -9482,7 +9483,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9814,13 +9814,14 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10182,10 +10183,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -10552,10 +10554,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -10830,8 +10833,9 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11108,31 +11112,31 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[3:4] +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB43_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -11490,10 +11494,11 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -11518,7 +11523,6 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11865,10 +11869,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -12318,23 +12323,24 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12345,8 +12351,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_cbranch_execnz .LBB46_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result @@ -12782,23 +12787,24 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12809,8 +12815,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB47_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13248,23 +13253,24 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13275,8 +13281,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB48_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -13701,6 +13706,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -14152,6 +14158,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -14524,31 +14531,31 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[3:4] +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB51_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -14887,6 +14894,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -15317,22 +15325,23 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15783,23 +15792,24 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15810,8 +15820,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_cbranch_execnz .LBB54_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -16240,6 +16249,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 2b15147365777..a412a4eebe7ea 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6350,10 +6350,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -6378,7 +6379,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -6756,10 +6756,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -6784,7 +6785,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7164,10 +7164,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -7192,7 +7193,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7544,13 +7544,14 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7937,10 +7938,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -8332,10 +8334,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -8636,31 +8639,31 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[3:4] +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -8932,8 +8935,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9335,10 +9339,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -9363,7 +9368,6 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9735,10 +9739,11 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -10188,24 +10193,25 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10216,8 +10222,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result @@ -10653,24 +10658,25 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10681,8 +10687,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11120,24 +11125,25 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11148,8 +11154,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11559,23 +11564,24 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12010,6 +12016,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12462,6 +12469,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12835,32 +12843,32 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[3:4] +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -13199,6 +13207,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -13661,24 +13670,25 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13689,8 +13699,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14119,6 +14128,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index ad7ee22fdb76e..c05d76a63a1d4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6350,10 +6350,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -6378,7 +6379,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result @@ -6756,10 +6756,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -6784,7 +6785,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7164,10 +7164,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -7192,7 +7193,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -7544,13 +7544,14 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7937,10 +7938,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -8332,10 +8334,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -8636,31 +8639,31 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[3:4] +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -8932,8 +8935,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9335,10 +9339,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -9363,7 +9368,6 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9735,10 +9739,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -10188,24 +10193,25 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10216,8 +10222,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_cbranch_execnz .LBB36_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result @@ -10653,24 +10658,25 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10681,8 +10687,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB37_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11120,24 +11125,25 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11148,8 +11154,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -11559,23 +11564,24 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12010,6 +12016,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12462,6 +12469,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12835,32 +12843,32 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[3:4] +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 @@ -13199,6 +13207,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -13661,24 +13670,25 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13689,8 +13699,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_cbranch_execnz .LBB44_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14119,6 +14128,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index a278be61104cc..d7c913cafd7d9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -6137,10 +6137,11 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -6165,7 +6166,6 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, half %val syncscope("agent") seq_cst ret half %result @@ -6518,10 +6518,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -6546,7 +6547,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst @@ -6901,10 +6901,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -6929,7 +6930,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst @@ -7261,13 +7261,14 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7629,10 +7630,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -7999,10 +8001,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -8283,31 +8286,31 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[3:4] +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4 @@ -8559,8 +8562,9 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8937,10 +8941,11 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 ; GFX7-NEXT: v_not_b32_e32 v4, v4 @@ -8965,7 +8970,6 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val seq_cst @@ -9312,10 +9316,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -9765,23 +9770,24 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -9792,8 +9798,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst ret bfloat %result @@ -10229,23 +10234,24 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10256,8 +10262,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: s_cbranch_execnz .LBB33_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst @@ -10695,23 +10700,24 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -10722,8 +10728,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: s_cbranch_execnz .LBB34_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst @@ -11133,22 +11138,23 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11583,6 +11589,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12034,6 +12041,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12406,31 +12414,31 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v3, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 +; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[3:4] +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4 @@ -12769,6 +12777,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -13230,23 +13239,24 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_not_b32_e32 v3, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2 +; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX7-NEXT: v_and_b32_e32 v7, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13257,8 +13267,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: s_cbranch_execnz .LBB40_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, bfloat %val seq_cst @@ -13687,6 +13696,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX7-NEXT: flat_load_dword v3, v[0:1] ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll index e9a6854226e60..7afdf102f5295 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll @@ -1039,25 +1039,21 @@ define half @v_max3_f16_maximumnum_maximumnum__v_v_v_0(half %a, half %b, half %c ; GFX6-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0: @@ -1678,31 +1674,31 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b, ; GFX6-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index dd2e9896cf882..b187f39c786aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -26,11 +26,10 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16: @@ -67,11 +66,10 @@ define half @test_fmax_legacy_ugt_f16_fast(half %a, half %b) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll index 99b163dc9753b..c0ff9b5a041ef 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll @@ -20,9 +20,11 @@ define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16: @@ -74,9 +76,11 @@ define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_flags: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_flags: @@ -128,10 +132,12 @@ define half @fmed3_f32_fpext_f16_multi_use(half %arg0, half %arg1, half %arg2, p ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_med3_f32 v1, v0, v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1 ; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -183,9 +189,10 @@ define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_med3_f32 v0, 2.0, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0: @@ -234,9 +241,10 @@ define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k1: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k1: @@ -285,9 +293,10 @@ define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k2: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, 2.0 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k2: @@ -333,23 +342,14 @@ define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 { } define half @fmed3_f32_fpext_f16_k0_k1(half %arg2) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k1: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mov_b32_e32 v1, 0x41800000 -; GFX7-SDAG-NEXT: v_med3_f32 v0, 0, v1, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k1: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, 0x41800000 -; GFX7-GISEL-NEXT: v_med3_f32 v0, 0, v1, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_k0_k1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0x41800000 +; GFX7-NEXT: v_med3_f32 v0, 0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k1: ; GFX8-SDAG: ; %bb.0: @@ -386,21 +386,13 @@ define half @fmed3_f32_fpext_f16_k0_k1(half %arg2) #1 { } define half @fmed3_f32_fpext_f16_k0_k2(half %arg1) #1 { -; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k2: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_med3_f32 v0, 0, v0, 2.0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k2: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_med3_f32 v0, 0, v0, 2.0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmed3_f32_fpext_f16_k0_k2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_med3_f32 v0, 0, v0, 2.0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k2: ; GFX8-SDAG: ; %bb.0: @@ -437,15 +429,11 @@ define half @fmed3_f32_fpext_f16_fabs(half %arg0, half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fabs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fabs: @@ -500,9 +488,11 @@ define half @fmed3_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_fabs_f32_fpext_f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_med3_f32 v0, |v0|, |v1|, |v2| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_fabs_f32_fpext_f16: @@ -549,9 +539,11 @@ define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg: @@ -606,9 +598,11 @@ define half @fmed3_fneg_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_fneg_f32_fpext_f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_fneg_f32_fpext_f16: @@ -655,15 +649,11 @@ define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) # ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg_fabs: @@ -721,9 +711,11 @@ define half @fmed3_fneg_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) # ; GFX7-SDAG-LABEL: fmed3_fneg_fabs_f32_fpext_f16: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_med3_f32 v0, -|v0|, -|v1|, -|v2| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_fneg_fabs_f32_fpext_f16: @@ -776,8 +768,11 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar ; GFX7-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16: @@ -823,11 +818,12 @@ define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2, ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_med3_f32 v1, v0, v1, v2 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-SDAG-NEXT: v_med3_f32 v0, v5, v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v5 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -891,10 +887,12 @@ define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2, ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -958,10 +956,12 @@ define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2, ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v2 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1022,33 +1022,15 @@ define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2, } define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 { -; GFX7-LABEL: fmed3_f32_fpext_bf16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmed3_f32_fpext_bf16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: fmed3_f32_fpext_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: fmed3_f32_fpext_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_med3_f32 v0, v0, v1, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %arg0.ext = fpext bfloat %arg0 to float %arg1.ext = fpext bfloat %arg1 to float %arg2.ext = fpext bfloat %arg2 to float @@ -1061,9 +1043,11 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1 ; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_0: @@ -1097,9 +1081,11 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1 ; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_1: @@ -1133,9 +1119,11 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1 ; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_2: @@ -1169,10 +1157,11 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k0(half %arg1, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, s4, v0, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0: @@ -1235,10 +1224,11 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k1(half %arg0, half %arg2) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, s4, v1 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1: @@ -1301,10 +1291,11 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k2(half %arg0, half %arg1) #1 { ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, s4 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2: @@ -1365,5 +1356,3 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k2(half %arg0, half %arg1) #1 { attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll index 65ced4f658692..a2de79079feed 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll @@ -9,12 +9,12 @@ define bfloat @v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum(bfloat %a) #1 { ; SI-LABEL: v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_max_f32_e32 v0, 2.0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum: diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index d1b1a96fdeffc..b37ab370d0bbf 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -8697,9 +8697,9 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) { ; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum: @@ -9181,10 +9181,10 @@ define half @v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum(half %a) { ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll index 6c78f55d2da86..bb6b20df0c149 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll @@ -1039,25 +1039,21 @@ define half @v_min3_f16_minimumnum_minimumnum__v_v_v_0(half %a, half %b, half %c ; GFX6-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0: @@ -1678,31 +1674,31 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b, ; GFX6-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 9e5a28d6c5041..dd77eb6f364a7 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -27,11 +27,10 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16: @@ -68,11 +67,10 @@ define half @test_fmin_legacy_ule_f16_fast(half %a, half %b) #0 { ; SI-LABEL: test_fmin_legacy_ule_f16_fast: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index b3f6de638a67d..2079ee54653ce 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -19,22 +19,20 @@ define half @v_fneg_add_f16(half %a, half %b) #0 { ; SI-SAFE-LABEL: v_fneg_add_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_add_f16: @@ -84,12 +82,11 @@ define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_add_store_use_add_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, v0, v1 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_add_store_use_add_f16: @@ -131,24 +128,24 @@ define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 { ; SI-SAFE-LABEL: v_fneg_add_multi_use_add_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_add_f32_e32 v1, v0, v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_multi_use_add_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_add_multi_use_add_f16: @@ -210,22 +207,19 @@ define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 { ; SI-SAFE-LABEL: v_fneg_add_fneg_x_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_fneg_x_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_add_fneg_x_f16: @@ -276,22 +270,19 @@ define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 { ; SI-SAFE-LABEL: v_fneg_add_x_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_x_fneg_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_add_x_fneg_f16: @@ -342,22 +333,20 @@ define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 { ; SI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_add_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16: @@ -409,24 +398,25 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 { ; SI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; SI-SAFE-NEXT: v_sub_f32_e32 v0, v3, v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-SAFE-NEXT: v_sub_f32_e32 v1, v1, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v2, -v1 +; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; SI-SAFE-NEXT: v_mov_b32_e32 v0, v2 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; SI-NSZ-NEXT: v_sub_f32_e32 v0, v3, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NSZ-NEXT: v_sub_f32_e32 v1, v2, v1 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; SI-NSZ-NEXT: v_mov_b32_e32 v0, v2 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16: @@ -493,30 +483,29 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c ; SI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v4, -v0 -; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v3 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, v4, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v0 -; SI-NSZ-NEXT: v_sub_f32_e32 v0, v3, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16: @@ -584,11 +573,8 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 { ; SI-LABEL: fneg_fadd_0_safe_f16: ; SI: ; %bb.0: ; %.entry -; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 ; SI-NEXT: v_rcp_f32_e32 v3, v2 ; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 @@ -608,6 +594,8 @@ define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, ; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: fneg_fadd_0_safe_f16: @@ -654,17 +642,16 @@ define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 { ; SI-LABEL: fneg_fadd_0_nsz_f16: ; SI: ; %bb.0: ; %.entry -; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 ; SI-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: fneg_fadd_0_nsz_f16: @@ -712,11 +699,11 @@ define half @v_fneg_mul_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_f16: @@ -749,12 +736,11 @@ define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_store_use_mul_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v1, v0, v1 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_store_use_mul_f16: @@ -796,12 +782,13 @@ define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_multi_use_mul_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_multi_use_mul_f16: @@ -844,11 +831,10 @@ define half @v_fneg_mul_fneg_x_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_fneg_x_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_fneg_x_f16: @@ -882,11 +868,10 @@ define half @v_fneg_mul_x_fneg_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_x_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_x_fneg_f16: @@ -920,11 +905,11 @@ define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_fneg_fneg_f16: @@ -959,12 +944,13 @@ define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_mul_store_use_fneg_x_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; SI-NEXT: v_mul_f32_e32 v0, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mul_f32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_store_use_fneg_x_f16: @@ -1011,15 +997,15 @@ define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c ; SI-LABEL: v_fneg_mul_multi_use_fneg_x_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0 -; SI-NEXT: v_mul_f32_e32 v0, v3, v1 -; SI-NEXT: v_mul_f32_e32 v1, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_mul_multi_use_fneg_x_f16: @@ -1071,11 +1057,12 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 { ; SI-LABEL: v_fneg_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_minnum_f16_ieee: @@ -1119,11 +1106,12 @@ define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 { ; SI-LABEL: v_fneg_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_minnum_f16_no_ieee: @@ -1156,7 +1144,8 @@ define half @v_fneg_self_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_self_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_minnum_f16_ieee: @@ -1189,7 +1178,8 @@ define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_self_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_minnum_f16_no_ieee: @@ -1222,9 +1212,10 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_posk_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_posk_minnum_f16_ieee: @@ -1264,9 +1255,10 @@ define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_posk_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_posk_minnum_f16_no_ieee: @@ -1299,9 +1291,10 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_negk_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_negk_minnum_f16_ieee: @@ -1341,9 +1334,10 @@ define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_negk_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_negk_minnum_f16_no_ieee: @@ -1376,10 +1370,9 @@ define half @v_fneg_0_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_0_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_minnum_f16: @@ -1419,9 +1412,10 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_neg0_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg0_minnum_f16_ieee: @@ -1461,9 +1455,10 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_inv2pi_minnum_f16: @@ -1507,9 +1502,10 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16: @@ -1553,9 +1549,10 @@ define half @v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_neg0_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg0_minnum_f16_no_ieee: @@ -1588,12 +1585,11 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee: @@ -1638,12 +1634,12 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16: @@ -1688,12 +1684,11 @@ define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 { ; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee: @@ -1734,12 +1729,14 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) ; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee: @@ -1790,8 +1787,6 @@ define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b) ; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 @@ -1851,11 +1846,12 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 { ; SI-LABEL: v_fneg_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_maxnum_f16_ieee: @@ -1899,11 +1895,12 @@ define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 { ; SI-LABEL: v_fneg_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_maxnum_f16_no_ieee: @@ -1936,7 +1933,8 @@ define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_self_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_maxnum_f16_ieee: @@ -1969,7 +1967,8 @@ define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_self_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_self_maxnum_f16_no_ieee: @@ -2002,9 +2001,10 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_posk_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_posk_maxnum_f16_ieee: @@ -2044,9 +2044,10 @@ define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_posk_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_posk_maxnum_f16_no_ieee: @@ -2079,9 +2080,10 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_negk_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_negk_maxnum_f16_ieee: @@ -2121,9 +2123,10 @@ define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_negk_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_negk_maxnum_f16_no_ieee: @@ -2156,10 +2159,9 @@ define half @v_fneg_0_maxnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_0_maxnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_maxnum_f16: @@ -2199,9 +2201,10 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 { ; SI-LABEL: v_fneg_neg0_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg0_maxnum_f16_ieee: @@ -2241,9 +2244,10 @@ define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 { ; SI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee: @@ -2276,12 +2280,11 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee: @@ -2326,12 +2329,11 @@ define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 { ; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee: @@ -2372,12 +2374,14 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) ; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee: @@ -2428,8 +2432,6 @@ define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b) ; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 @@ -2489,26 +2491,21 @@ define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_f16: @@ -2558,14 +2555,12 @@ define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) # ; SI-LABEL: v_fneg_fma_store_use_fma_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_fma_f32 v1, v0, v1, v2 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fma_store_use_fma_f16: @@ -2607,28 +2602,25 @@ define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) # ; SI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_fma_f32 v1, v0, v1, v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16: @@ -2689,26 +2681,21 @@ define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16: @@ -2759,26 +2746,21 @@ define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_fma_f32 v0, v0, -v1, v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16: @@ -2829,26 +2811,21 @@ define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16: @@ -2900,26 +2877,21 @@ define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, -v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16: @@ -2971,26 +2943,21 @@ define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, -v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16: @@ -3041,28 +3008,26 @@ define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half ; SI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v3, -v0 +; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 -; SI-SAFE-NEXT: v_fma_f32 v0, v3, v4, v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-SAFE-NEXT: v_fma_f32 v0, v0, v3, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v4, v3, -v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, -v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; SI-NSZ-NEXT: v_mov_b32_e32 v0, v2 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16: @@ -3129,33 +3094,30 @@ define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half ; SI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_fma_f32 v0, v4, v1, v2 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; SI-SAFE-NEXT: v_mul_f32_e32 v1, v4, v3 +; SI-SAFE-NEXT: v_fma_f32 v1, v0, v1, v2 +; SI-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v1 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v5, -v0 -; SI-NSZ-NEXT: v_fma_f32 v0, v4, v1, -v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v5, v3 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v4 +; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v3 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16: @@ -3227,26 +3189,22 @@ define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 { ; SI-SAFE-LABEL: v_fneg_fmad_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v2 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fmad_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e64 v1, -v1 +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fmad_f16: @@ -3432,28 +3390,26 @@ define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c) ; SI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v2 ; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v2 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NSZ-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16: @@ -3518,7 +3474,9 @@ define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 { ; SI-LABEL: v_fneg_fp_extend_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f64_f32_e64 v[0:1], -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fp_extend_f16_to_f64: @@ -3562,6 +3520,7 @@ define double @v_fneg_fp_extend_fneg_f16_to_f64(half %a) #0 { ; SI-LABEL: v_fneg_fp_extend_fneg_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3604,8 +3563,10 @@ define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64: @@ -3656,6 +3617,7 @@ define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) # ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v3 ; SI-NEXT: v_mov_b32_e32 v0, v2 @@ -3711,6 +3673,7 @@ define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(h ; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v1 ; SI-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0 @@ -3765,8 +3728,9 @@ define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0 ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32: @@ -3856,7 +3820,7 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fp_round_f64_to_f16: @@ -4008,7 +3972,7 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_fp_round_fneg_f64_to_f16: @@ -4120,48 +4084,48 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; SI-NEXT: v_and_b32_e32 v0, 0x1ff, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2 -; SI-NEXT: v_bfe_u32 v4, v1, 20, 11 +; SI-NEXT: v_and_b32_e32 v4, 0xffe, v4 +; SI-NEXT: v_bfe_u32 v5, v1, 20, 11 ; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4 -; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0 -; SI-NEXT: v_med3_i32 v5, v5, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v2 -; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v5 +; SI-NEXT: v_or_b32_e32 v4, 0x1000, v0 +; SI-NEXT: v_med3_i32 v6, v6, 0, 13 +; SI-NEXT: v_lshrrev_b32_e32 v7, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4 ; SI-NEXT: s_movk_i32 s4, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v5, v0, v5 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4 -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; SI-NEXT: v_and_b32_e32 v5, 7, v2 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, s4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v5 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NEXT: v_and_b32_e32 v6, 7, v4 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 ; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; SI-NEXT: v_mov_b32_e32 v5, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4 -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; SI-NEXT: v_mov_b32_e32 v6, 0x7e00 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; SI-NEXT: v_mov_b32_e32 v6, 0x7c00 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5 +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; SI-NEXT: v_mov_b32_e32 v7, 0x7e00 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_movk_i32 s4, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4 -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4423,7 +4387,7 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_mul_f64 v[1:2], -v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4582,10 +4546,9 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16: @@ -4798,9 +4761,10 @@ define half @v_fneg_trunc_f16(half %a) #0 { ; SI-LABEL: v_fneg_trunc_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_trunc_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_trunc_f16: @@ -4837,7 +4801,6 @@ define half @v_fneg_round_f16(half %a) #0 { ; SI-SAFE-LABEL: v_fneg_round_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SAFE-NEXT: v_trunc_f32_e32 v1, v0 ; SI-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -4846,13 +4809,12 @@ define half @v_fneg_round_f16(half %a) #0 { ; SI-SAFE-NEXT: s_brev_b32 s4, -2 ; SI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0 ; SI-SAFE-NEXT: v_add_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: v_fneg_round_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NSZ-NEXT: v_trunc_f32_e32 v1, v0 ; SI-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1 @@ -4861,6 +4823,7 @@ define half @v_fneg_round_f16(half %a) #0 { ; SI-NSZ-NEXT: s_brev_b32 s4, -2 ; SI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0 ; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_round_f16: @@ -4964,9 +4927,10 @@ define half @v_fneg_rint_f16(half %a) #0 { ; SI-LABEL: v_fneg_rint_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_rndne_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_rint_f16: @@ -5003,9 +4967,10 @@ define half @v_fneg_nearbyint_f16(half %a) #0 { ; SI-LABEL: v_fneg_nearbyint_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_rndne_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_nearbyint_f16: @@ -5042,11 +5007,12 @@ define half @v_fneg_sin_f16(half %a) #0 { ; SI-LABEL: v_fneg_sin_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; SI-NEXT: v_fract_f32_e32 v0, v0 ; SI-NEXT: v_sin_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_sin_f16: @@ -5091,8 +5057,9 @@ define half @v_fneg_canonicalize_f16(half %a) #0 { ; SI-LABEL: v_fneg_canonicalize_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_canonicalize_f16: @@ -5129,12 +5096,10 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; SI-LABEL: v_fneg_copytoreg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_and_b32_e32 v6, 0x3ff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v6, 0x3ff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v6 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: v_mul_f32_e32 v2, v2, v3 @@ -5142,11 +5107,12 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_cbranch_execz .LBB81_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: v_cvt_f16_f32_e64 v3, -v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f16_f32_e64 v4, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_mul_f32_e32 v3, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_mul_f32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: flat_store_short v[0:1], v3 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5269,14 +5235,13 @@ define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 { ; SI-LABEL: v_fneg_inlineasm_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ; use v1 +; SI-NEXT: ; use v0 ; SI-NEXT: ;;#ASMEND ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5328,12 +5293,11 @@ define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a, ; SI-LABEL: v_fneg_inlineasm_multi_use_src_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 -; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ;;#ASMSTART ; SI-NEXT: ; use v1 ; SI-NEXT: ;;#ASMEND @@ -5398,14 +5362,13 @@ define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c) ; SI-LABEL: multiuse_fneg_2_vop3_users_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 -; SI-NEXT: v_fma_f32 v0, -v3, v1, v2 -; SI-NEXT: v_fma_f32 v1, -v3, v2, 2.0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_fma_f32 v1, -v0, v1, v2 +; SI-NEXT: v_fma_f32 v2, -v0, v2, 2.0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_2_vop3_users_f16: @@ -5454,14 +5417,14 @@ define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c) ; SI-LABEL: multiuse_fneg_2_vop2_users_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mul_f32_e32 v0, v3, v1 -; SI-NEXT: v_mul_f32_e32 v1, v3, v2 +; SI-NEXT: v_mul_f32_e32 v1, v0, v1 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_2_vop2_users_f16: @@ -5509,14 +5472,14 @@ define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out, ; SI-LABEL: multiuse_fneg_vop2_vop3_users_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f16_f32_e64 v1, -v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_fma_f32 v0, v1, v0, 2.0 -; SI-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_fma_f32 v1, v0, v1, 2.0 +; SI-NEXT: v_mul_f32_e32 v2, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multiuse_fneg_vop2_vop3_users_f16: @@ -5563,33 +5526,29 @@ define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %ou ; SI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16: ; SI-SAFE: ; %bb.0: ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-SAFE-NEXT: v_fma_f32 v0, v1, v0, 2.0 +; SI-SAFE-NEXT: v_mul_f32_e64 v1, -v0, v2 +; SI-SAFE-NEXT: v_mul_f32_e64 v2, -v0, v3 +; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_fma_f32 v1, v1, v0, 2.0 -; SI-SAFE-NEXT: v_mul_f32_e64 v0, -v1, v2 -; SI-SAFE-NEXT: v_mul_f32_e64 v1, -v1, v3 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; SI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16: ; SI-NSZ: ; %bb.0: ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NSZ-NEXT: v_fma_f32 v0, v1, -v0, -2.0 +; SI-NSZ-NEXT: v_mul_f32_e32 v1, v0, v2 +; SI-NSZ-NEXT: v_mul_f32_e32 v2, v0, v3 +; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v4 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NSZ-NEXT: v_fma_f32 v1, v1, -v0, -2.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, v1, v2 -; SI-NSZ-NEXT: v_mul_f32_e32 v1, v1, v3 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16: @@ -5661,14 +5620,12 @@ define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, h ; SI-LABEL: one_use_cost_to_fold_into_src_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 ; SI-NEXT: v_trunc_f32_e32 v1, v1 ; SI-NEXT: v_fma_f32 v0, -v1, v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: one_use_cost_to_fold_into_src_f16: @@ -5713,17 +5670,15 @@ define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ha ; SI-LABEL: multi_use_cost_to_fold_into_src: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 ; SI-NEXT: v_trunc_f32_e32 v1, v1 ; SI-NEXT: v_fma_f32 v0, -v1, v2, v0 ; SI-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: multi_use_cost_to_fold_into_src: @@ -5841,11 +5796,11 @@ define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 { ; SI-LABEL: nnan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: nnan_fmul_neg1_to_fneg: @@ -5880,11 +5835,11 @@ define half @denormal_fmul_neg1_to_fneg(half %x, half %y) { ; SI-LABEL: denormal_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: denormal_fmul_neg1_to_fneg: @@ -5918,13 +5873,13 @@ define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) { ; SI-LABEL: denorm_snan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, v2, v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v2 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: denorm_snan_fmul_neg1_to_fneg: @@ -5965,11 +5920,11 @@ define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 { ; SI-LABEL: flush_snan_fmul_neg1_to_fneg: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: flush_snan_fmul_neg1_to_fneg: @@ -6010,15 +5965,13 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: fadd_select_fneg_fneg_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index bdea710725ace..5d23f648f707b 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -7777,15 +7777,13 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: fadd_select_fneg_fneg_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 58adbd4d0d250..afe0b8c3b392b 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -938,9 +938,10 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_inv2pi_minnum_f16: @@ -959,9 +960,10 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16: @@ -1489,12 +1491,13 @@ define half @v_fneg_inv2pi_minimum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minimum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_max_f32_e32 v2, 0xbe230000, v0 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_inv2pi_minimum_f16: @@ -1515,12 +1518,13 @@ define half @v_fneg_neg_inv2pi_minimum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minimum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_max_f32_e32 v2, 0x3e230000, v0 ; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg_inv2pi_minimum_f16: @@ -2080,9 +2084,10 @@ define half @v_fneg_inv2pi_minimumnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_inv2pi_minimumnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_inv2pi_minimumnum_f16: @@ -2101,9 +2106,10 @@ define half @v_fneg_neg_inv2pi_minimumnum_f16(half %a) #0 { ; SI-LABEL: v_fneg_neg_inv2pi_minimumnum_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_neg_inv2pi_minimumnum_f16: @@ -2809,8 +2815,9 @@ define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0 ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1 +; SI-NEXT: v_cvt_f32_f16_e64 v2, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32: @@ -2830,9 +2837,9 @@ define { float, float } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(hal ; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 -; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32: @@ -2921,36 +2928,22 @@ define { float, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f32(double %a, d } define half @v_fneg_fp_round_f32_to_f16(float %a) #0 { -; SI-LABEL: v_fneg_fp_round_f32_to_f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_fneg_fp_round_f32_to_f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_fneg_fp_round_f32_to_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %fpround = fptrunc float %a to half %fneg = fneg half %fpround ret half %fneg } define half @v_fneg_fp_round_fneg_f32_to_f16(float %a) #0 { -; SI-LABEL: v_fneg_fp_round_fneg_f32_to_f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_fneg_fp_round_fneg_f32_to_f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_fneg_fp_round_fneg_f32_to_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg.a = fneg float %a %fpround = fptrunc float %fneg.a to half %fneg = fneg half %fpround @@ -2972,22 +2965,13 @@ define { float, float } @v_fneg_multi_use_fp_round_fneg_f64_to_f32(double %a) #0 } define { half, float } @v_fneg_fp_round_store_use_fneg_f32_to_f16(float %a) #0 { -; SI-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 -; VI-NEXT: v_mov_b32_e32 v0, v2 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg.a = fneg float %a %fpround = fptrunc float %fneg.a to half %fneg = fneg half %fpround @@ -2997,22 +2981,13 @@ define { half, float } @v_fneg_fp_round_store_use_fneg_f32_to_f16(float %a) #0 { } define { half, float } @v_fneg_fp_round_multi_use_fneg_f32_to_f16(float %a, float %c) #0 { -; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: v_mul_f32_e64 v1, -v0, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; VI-NEXT: v_mul_f32_e64 v1, -v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, v2 -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GCN-NEXT: v_mul_f32_e64 v1, -v0, v1 +; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] %fneg.a = fneg float %a %fpround = fptrunc float %fneg.a to half %fneg = fneg half %fpround @@ -3784,15 +3759,13 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) { ; SI-LABEL: fadd_select_fneg_fneg_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_sub_f32_e32 v0, v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: fadd_select_fneg_fneg_f16: @@ -4176,12 +4149,12 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { ; SI-LABEL: v_fneg_select_infloop_regression_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 1, v1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_select_infloop_regression_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index e5c34f695f9a7..d9dea4f1fd6e7 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -601,16 +601,16 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1 ; GFX7-LABEL: select_fneg_select_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e64 v2, -v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e64 v2, -v2 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_select_f16: @@ -1320,11 +1320,9 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1 ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1370,14 +1368,12 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index c277f3b546c6b..4fe0882a19f18 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -605,6 +605,7 @@ define half @v_fneg_i16_fp_use(i16 %in) { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_i16_fp_use: diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll index fd7816e7df1d9..3f1aea2e3773d 100644 --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -136,13 +136,12 @@ define half @v_pow_f16(half %x, half %y) { ; GFX6-LABEL: v_pow_f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index a2bd98d3d7b27..5ff03c8dd4543 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -1578,21 +1578,21 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) { ; GFX6-LABEL: basic_fract_f16_nonan: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_floor_f32_e32 v1, v0 ; GFX6-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX6-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: basic_fract_f16_nonan: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_floor_f32_e32 v1, v0 ; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: basic_fract_f16_nonan: @@ -1859,18 +1859,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly ; GFX6-LABEL: safe_math_fract_f16_noinf_check: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: v_floor_f32_e32 v3, v0 ; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1878,18 +1878,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly ; GFX7-LABEL: safe_math_fract_f16_noinf_check: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: v_floor_f32_e32 v3, v0 ; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2486,46 +2486,46 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX6-LABEL: safe_math_fract_f16: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s8, 0x7c00 -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: s_movk_i32 s4, 0x7c00 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_floor_f32_e32 v4, v3 ; GFX6-NEXT: v_sub_f32_e32 v5, v3, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s6 +; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: safe_math_fract_f16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_movk_i32 s8, 0x7c00 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_floor_f32_e32 v4, v3 ; GFX7-NEXT: v_sub_f32_e32 v5, v3, v4 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 1117e7f74f11c..7ff700d2cd101 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -409,24 +409,14 @@ define void @void_func_i64(i64 %arg0) #0 { } define void @void_func_f16(half %arg0) #0 { -; CI-LABEL: void_func_f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_f16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_f16: ; GFX11: ; %bb.0: @@ -2686,11 +2676,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(5) @@ -2704,25 +2694,22 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_mul_f32_e32 v12, 1.0, v32 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v33 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 1, v34 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; CI-NEXT: v_and_b32_e32 v0, 1, v32 ; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0 +; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v36, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v34, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v35, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v1, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v36, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -4434,25 +4421,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { define void @void_func_bf16(bfloat %arg0) #0 { -; CI-LABEL: void_func_bf16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: void_func_bf16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; CIGFX89-LABEL: void_func_bf16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_bf16: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index c923431bb17c1..4add34dd7d956 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -2259,24 +2259,14 @@ define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0) } define bfloat @bf16_func_void() #0 { -; CI-LABEL: bf16_func_void: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-LABEL: bf16_func_void: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 -; GFX89-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX789-LABEL: bf16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: bf16_func_void: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 8996a8d9ce4bf..d4bddf26d0ed3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -9207,10 +9207,11 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 @@ -9237,7 +9238,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: @@ -9250,10 +9250,11 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 @@ -9280,7 +9281,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9726,10 +9726,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -9756,7 +9757,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9770,10 +9770,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -9801,7 +9802,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -10250,10 +10250,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -10280,7 +10281,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10294,10 +10294,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -10325,7 +10326,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 @@ -10746,13 +10746,14 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10787,13 +10788,14 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -11247,10 +11249,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -11289,10 +11292,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -11753,10 +11757,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -11795,10 +11800,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -12152,30 +12158,31 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB50_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12186,31 +12193,32 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB50_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -12525,8 +12533,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12558,8 +12567,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13031,10 +13041,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -13061,7 +13072,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13075,10 +13085,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -13106,7 +13117,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -13542,10 +13552,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -13584,10 +13595,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -14143,6 +14155,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -14172,7 +14185,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: @@ -14186,6 +14198,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -14215,7 +14228,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -14754,6 +14766,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -14783,7 +14796,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -14798,6 +14810,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -14828,7 +14841,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -15372,6 +15384,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -15401,7 +15414,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -15416,6 +15428,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -15446,7 +15459,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 @@ -15954,22 +15966,23 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -15995,23 +16008,24 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -16546,6 +16560,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -16588,6 +16603,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -17144,6 +17160,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -17186,6 +17203,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -17643,30 +17661,31 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB60_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -17677,31 +17696,32 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB60_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -18114,6 +18134,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -18147,6 +18168,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -18715,6 +18737,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -18744,7 +18767,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -18759,6 +18781,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -18789,7 +18812,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -19317,6 +19339,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -19359,6 +19382,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 279cff3f5d368..bcf51f89920c0 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4777,10 +4777,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 @@ -4807,7 +4808,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory: @@ -4820,10 +4820,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 @@ -4850,7 +4851,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5233,10 +5233,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -5263,7 +5264,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5277,10 +5277,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -5308,7 +5309,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -5693,10 +5693,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -5723,7 +5724,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5737,10 +5737,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -5768,7 +5769,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 @@ -6125,13 +6125,14 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6166,13 +6167,14 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6567,10 +6569,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -6609,10 +6612,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -7012,10 +7016,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -7054,10 +7059,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -7366,30 +7372,31 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7400,31 +7407,32 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -7697,8 +7705,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7730,8 +7739,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8141,10 +8151,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -8171,7 +8182,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8185,10 +8195,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -8216,7 +8227,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -8593,10 +8603,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -8635,10 +8646,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -9097,6 +9109,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9127,7 +9140,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9141,6 +9153,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9171,7 +9184,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9613,6 +9625,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9643,7 +9656,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9658,6 +9670,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9689,7 +9702,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -10133,6 +10145,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -10163,7 +10176,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10178,6 +10190,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -10209,7 +10222,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 @@ -10624,23 +10636,24 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -10666,24 +10679,25 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -11125,6 +11139,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -11168,6 +11183,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -11628,6 +11644,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -11671,6 +11688,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12052,31 +12070,32 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12087,32 +12106,33 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_max_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -12452,6 +12472,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -12486,6 +12507,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -12957,6 +12979,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12987,7 +13010,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13002,6 +13024,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -13033,7 +13056,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -13467,6 +13489,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -13510,6 +13533,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index e658cb658de78..9406e08e9e412 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4777,10 +4777,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 @@ -4807,7 +4808,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory: @@ -4820,10 +4820,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 @@ -4850,7 +4851,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -5233,10 +5233,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -5263,7 +5264,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5277,10 +5277,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -5308,7 +5309,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -5693,10 +5693,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -5723,7 +5724,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -5737,10 +5737,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -5768,7 +5769,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 @@ -6125,13 +6125,14 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6166,13 +6167,14 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -6567,10 +6569,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -6609,10 +6612,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -7012,10 +7016,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -7054,10 +7059,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -7366,30 +7372,31 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB32_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -7400,31 +7407,32 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB32_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -7697,8 +7705,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7730,8 +7739,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8141,10 +8151,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -8171,7 +8182,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8185,10 +8195,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -8216,7 +8227,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -8593,10 +8603,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -8635,10 +8646,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -9097,6 +9109,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9127,7 +9140,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: @@ -9141,6 +9153,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9171,7 +9184,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -9613,6 +9625,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9643,7 +9656,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9658,6 +9670,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9689,7 +9702,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -10133,6 +10145,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -10163,7 +10176,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -10178,6 +10190,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -10209,7 +10222,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 @@ -10624,23 +10636,24 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -10666,24 +10679,25 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -11125,6 +11139,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -11168,6 +11183,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -11628,6 +11644,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -11671,6 +11688,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12052,31 +12070,32 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB42_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: @@ -12087,32 +12106,33 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_min_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB42_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -12452,6 +12472,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -12486,6 +12507,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -12957,6 +12979,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12987,7 +13010,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -13002,6 +13024,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -13033,7 +13056,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -13467,6 +13489,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -13510,6 +13533,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 965c10b2e9ff9..f4b7280062bb8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -5513,10 +5513,11 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX7-NEXT: v_not_b32_e32 v7, v2 @@ -5543,7 +5544,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f16: @@ -5556,10 +5556,11 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5 ; GFX6-NEXT: v_not_b32_e32 v7, v2 @@ -5586,7 +5587,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst @@ -5944,10 +5944,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -5974,7 +5975,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos: @@ -5988,10 +5988,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -6019,7 +6020,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -6379,10 +6379,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -6409,7 +6410,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg: @@ -6423,10 +6423,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -6454,7 +6455,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024 @@ -6791,13 +6791,14 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_not_b32_e32 v6, v3 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6832,13 +6833,14 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_not_b32_e32 v6, v3 -; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -7208,10 +7210,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -7250,10 +7253,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -7628,10 +7632,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -7670,10 +7675,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -7962,30 +7968,31 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4: @@ -7996,31 +8003,32 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB28_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -8273,8 +8281,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8306,8 +8315,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8692,10 +8702,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX7-NEXT: v_not_b32_e32 v8, v2 @@ -8722,7 +8733,6 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos: @@ -8736,10 +8746,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6 ; GFX6-NEXT: v_not_b32_e32 v8, v2 @@ -8767,7 +8778,6 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023 @@ -9119,10 +9129,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX7-NEXT: v_not_b32_e32 v6, v2 @@ -9161,10 +9172,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_not_b32_e32 v6, v2 @@ -9623,6 +9635,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9652,7 +9665,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16: @@ -9666,6 +9678,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -9695,7 +9708,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst @@ -10137,6 +10149,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -10166,7 +10179,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos: @@ -10181,6 +10193,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -10211,7 +10224,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -10655,6 +10667,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -10684,7 +10697,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg: @@ -10699,6 +10711,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -10729,7 +10742,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024 @@ -11144,22 +11156,23 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_not_b32_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v8, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 @@ -11185,23 +11198,24 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3 -; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_not_b32_e32 v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 3, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 +; GFX6-NEXT: v_not_b32_e32 v5, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v7, v4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v8, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 @@ -11643,6 +11657,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -11685,6 +11700,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12144,6 +12160,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12186,6 +12203,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -12566,30 +12584,31 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, v4 +; GFX7-NEXT: v_mov_b32_e32 v4, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB38_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4: @@ -12600,31 +12619,32 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v4, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB38_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -12964,6 +12984,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -12997,6 +13018,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -13467,6 +13489,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -13496,7 +13519,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos: @@ -13511,6 +13533,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -13541,7 +13564,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 @@ -13975,6 +13997,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 @@ -14017,6 +14040,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll index 53cfd12a953d3..fb5674310442d 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll @@ -13,7 +13,7 @@ define bfloat @v_uitofp_i1_to_bf16(i1 %num) { ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_i1_to_bf16: @@ -1337,7 +1337,7 @@ define bfloat @v_sitofp_i1_to_bf16(i1 %num) { ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_i1_to_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll index 7b356d26d608a..423ee839b06ba 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll @@ -94,7 +94,6 @@ define half @v_uitofp_i16_to_f16_abs(i16 %arg0) nounwind { ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_i16_to_f16_abs: @@ -131,7 +130,6 @@ define half @v_uitofp_i16_to_f16_neg(i16 %arg0) nounwind { ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_i16_to_f16_neg: @@ -168,7 +166,6 @@ define half @s_uitofp_i16_to_f16_abs(i16 inreg %arg0) nounwind { ; GFX7-NEXT: s_and_b32 s4, s16, 0x7fff ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: s_uitofp_i16_to_f16_abs: @@ -205,7 +202,6 @@ define half @s_uitofp_i16_to_f16_neg(i16 inreg %arg0) nounwind { ; GFX7-NEXT: s_and_b32 s4, s16, 0x8000 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: s_uitofp_i16_to_f16_neg: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll index 6e887f54de861..f2bb5a4aadee5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll @@ -12,7 +12,6 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: raw_ptr_buffer_load_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll index 395de3d4e2379..60c3d8d8734f6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll @@ -9,8 +9,6 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) { ; GFX7-LABEL: buffer_store_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 33b644181af52..ee01c9d0acdc7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -5527,16 +5527,17 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) { ; SI-LABEL: v_exp_f32_from_fpext_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; SI-NEXT: v_rndne_f32_e32 v2, v1 -; SI-NEXT: v_sub_f32_e32 v3, v1, v2 -; SI-NEXT: v_fma_f32 v1, v0, s4, -v1 +; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; SI-NEXT: v_fma_f32 v2, v0, s4, -v1 ; SI-NEXT: s_mov_b32 s4, 0x32a5705f -; SI-NEXT: v_fma_f32 v1, v0, s4, v1 -; SI-NEXT: v_add_f32_e32 v1, v3, v1 +; SI-NEXT: v_rndne_f32_e32 v3, v1 +; SI-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-NEXT: v_sub_f32_e32 v1, v1, v3 +; SI-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-NEXT: v_exp_f32_e32 v1, v1 -; SI-NEXT: v_cvt_i32_f32_e32 v2, v2 +; SI-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-NEXT: s_mov_b32 s4, 0xc2ce8ed0 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 ; SI-NEXT: s_mov_b32 s4, 0x42b17218 @@ -5805,25 +5806,14 @@ define half @v_exp_f16(half %in) { ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f16: ; R600: ; %bb.0: @@ -5848,25 +5838,14 @@ define half @v_exp_fabs_f16(half %in) { ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_fabs_f16: ; R600: ; %bb.0: @@ -5904,12 +5883,10 @@ define half @v_exp_fneg_fabs_f16(half %in) { ; SI-SDAG-LABEL: v_exp_fneg_fabs_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_fabs_f16: @@ -5958,12 +5935,10 @@ define half @v_exp_fneg_f16(half %in) { ; SI-SDAG-LABEL: v_exp_fneg_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_fneg_f16: @@ -6000,10 +5975,10 @@ define half @v_exp_f16_fast(half %in) { ; SI-SDAG-LABEL: v_exp_f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp_f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index de1f2e900e326..7d830a9306293 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -5589,16 +5589,17 @@ define float @v_exp10_f32_from_fpext_bf16(bfloat %src) { ; SI-LABEL: v_exp10_f32_from_fpext_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_mov_b32 s4, 0x40549a78 -; SI-NEXT: v_rndne_f32_e32 v2, v1 -; SI-NEXT: v_sub_f32_e32 v3, v1, v2 -; SI-NEXT: v_fma_f32 v1, v0, s4, -v1 +; SI-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 +; SI-NEXT: v_fma_f32 v2, v0, s4, -v1 ; SI-NEXT: s_mov_b32 s4, 0x33979a37 -; SI-NEXT: v_fma_f32 v1, v0, s4, v1 -; SI-NEXT: v_add_f32_e32 v1, v3, v1 +; SI-NEXT: v_rndne_f32_e32 v3, v1 +; SI-NEXT: v_fma_f32 v2, v0, s4, v2 +; SI-NEXT: v_sub_f32_e32 v1, v1, v3 +; SI-NEXT: v_add_f32_e32 v1, v1, v2 ; SI-NEXT: v_exp_f32_e32 v1, v1 -; SI-NEXT: v_cvt_i32_f32_e32 v2, v2 +; SI-NEXT: v_cvt_i32_f32_e32 v2, v3 ; SI-NEXT: s_mov_b32 s4, 0xc23369f4 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 ; SI-NEXT: s_mov_b32 s4, 0x421a209b @@ -5876,25 +5877,14 @@ define half @v_exp10_f16(half %in) { ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp10_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp10_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_f16: ; R600: ; %bb.0: @@ -5919,25 +5909,14 @@ define half @v_exp10_fabs_f16(half %in) { ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-SDAG-LABEL: v_exp10_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp10_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_fabs_f16: ; R600: ; %bb.0: @@ -5975,12 +5954,10 @@ define half @v_exp10_fneg_fabs_f16(half %in) { ; SI-SDAG-LABEL: v_exp10_fneg_fabs_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549a78, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_fneg_fabs_f16: @@ -6029,12 +6006,10 @@ define half @v_exp10_fneg_f16(half %in) { ; SI-SDAG-LABEL: v_exp10_fneg_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0 +; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549a78, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_fneg_f16: @@ -6074,13 +6049,13 @@ define half @v_exp10_f16_fast(half %in) { ; SI-SDAG-LABEL: v_exp10_f16_fast: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a278000, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp10_f16_fast: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 390fedb1d2ef3..97ecb5362a4bc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -2828,11 +2828,12 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) { ; SI-LABEL: v_exp2_f32_from_fpext_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_mov_b32 s4, 0xc2fc0000 +; SI-NEXT: v_mov_b32_e32 v1, 0x42800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-NEXT: v_mov_b32_e32 v2, 0x42800000 -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-NEXT: v_exp_f32_e32 v0, v0 ; SI-NEXT: v_not_b32_e32 v1, 63 ; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -2896,23 +2897,13 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) { ; FIXME: Fold out fp16_to_fp (FP_TO_FP16) on no-f16 targets define half @v_exp2_f16(half %in) { -; SI-SDAG-LABEL: v_exp2_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_exp2_f16: ; VI: ; %bb.0: @@ -2945,23 +2936,13 @@ define half @v_exp2_f16(half %in) { } define half @v_exp2_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_exp2_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_exp2_fabs_f16: ; VI: ; %bb.0: @@ -2998,11 +2979,10 @@ define half @v_exp2_fneg_fabs_f16(half %in) { ; SI-SDAG-LABEL: v_exp2_fneg_fabs_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fneg_fabs_f16: @@ -3049,11 +3029,10 @@ define half @v_exp2_fneg_f16(half %in) { ; SI-SDAG-LABEL: v_exp2_fneg_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_exp2_fneg_f16: @@ -3096,21 +3075,13 @@ define half @v_exp2_fneg_f16(half %in) { } define half @v_exp2_f16_fast(half %in) { -; SI-SDAG-LABEL: v_exp2_f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp2_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_exp_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_exp2_f16_fast: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index c562eb168478f..7300e4227925f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -19,14 +19,14 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i32: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0 -; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v2, v0 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f16_i32: @@ -155,12 +155,12 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i32_only_use_fract: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f16_i32_only_use_fract: @@ -258,9 +258,8 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i32_only_use_exp: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc @@ -930,14 +929,14 @@ define { half, i16 } @test_frexp_f16_i16(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i16: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0 -; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v2, v0 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f16_i16: @@ -1064,12 +1063,12 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i16_only_use_fract: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f16_i16_only_use_fract: @@ -1167,9 +1166,8 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) { ; GFX6-SDAG-LABEL: test_frexp_f16_i16_only_use_exp: ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 ; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 ; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index e94a2813f2ecc..46cd8c07345ce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -161,8 +161,7 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: snan_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -232,8 +231,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: qnan_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -288,8 +286,7 @@ define i1 @posinf_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: posinf_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -339,8 +336,7 @@ define i1 @neginf_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: neginf_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_mov_b32 s4, 0xff80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -390,9 +386,8 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: posnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00 @@ -466,9 +461,8 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: negnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00 @@ -542,8 +536,6 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: possubnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f @@ -600,9 +592,8 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: negsubnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e64 v0, s[4:5], -1, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 @@ -675,8 +666,7 @@ define i1 @poszero_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: poszero_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] @@ -723,8 +713,7 @@ define i1 @negzero_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: negzero_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_mov_b32 s4, 0x8000 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -774,8 +763,7 @@ define i1 @posfinite_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: posfinite_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -825,9 +813,8 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: negfinite_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0 @@ -894,8 +881,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: isnan_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -950,8 +936,7 @@ define i1 @not_isnan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_isnan_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1276,8 +1261,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: isinf_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1332,8 +1316,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind { ; GFX7CHECK-LABEL: isfinite_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1388,8 +1371,6 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) { ; GFX7CHECK-LABEL: issubnormal_or_zero_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1443,8 +1424,6 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_issubnormal_or_zero_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1498,8 +1477,7 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: isnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f00 @@ -1561,8 +1539,7 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_isnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7eff @@ -1624,9 +1601,8 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_is_plus_normal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7eff @@ -1700,9 +1676,8 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_is_neg_normal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7eff @@ -1776,8 +1751,7 @@ define i1 @issubnormal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: issubnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f ; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 @@ -1838,8 +1812,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_issubnormal_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7e ; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 @@ -1900,8 +1873,7 @@ define i1 @iszero_bf16(bfloat %x) { ; GFX7CHECK-LABEL: iszero_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1953,8 +1925,7 @@ define i1 @not_iszero_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_iszero_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] @@ -2006,8 +1977,7 @@ define i1 @ispositive_bf16(bfloat %x) { ; GFX7CHECK-LABEL: ispositive_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2057,10 +2027,9 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_ispositive_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v2, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7CHECK-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s6, v0 @@ -2154,10 +2123,9 @@ define i1 @isnegative_bf16(bfloat %x) { ; GFX7CHECK-LABEL: isnegative_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v2, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7CHECK-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0 @@ -2239,9 +2207,8 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_isnegative_bf16: ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 @@ -2311,8 +2278,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: iszero_or_nan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -2380,8 +2346,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX7CHECK-LABEL: iszero_or_nan_f_daz: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -2449,8 +2414,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX7CHECK-LABEL: iszero_or_nan_f_maybe_daz: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -2518,8 +2482,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_iszero_or_nan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 @@ -2587,8 +2550,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX7CHECK-LABEL: not_iszero_or_nan_f_daz: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 @@ -2656,8 +2618,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX7CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 @@ -2725,8 +2686,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: iszero_or_qnan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -2794,8 +2754,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: iszero_or_snan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 @@ -2878,8 +2837,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_iszero_or_qnan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX7CHECK-NEXT: s_movk_i32 s8, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 @@ -3005,8 +2963,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_iszero_or_snan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_add_i32_e64 v1, s[4:5], -1, v0 @@ -3120,8 +3077,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: isinf_or_nan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f7f ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -3177,8 +3133,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) { ; GFX7CHECK-LABEL: not_isinf_or_nan_bf16: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -3234,8 +3189,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) { ; GFX7CHECK-LABEL: isfinite_or_nan_f: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -3291,8 +3245,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) { ; GFX7CHECK-LABEL: not_isfinite_or_nan_f: ; GFX7CHECK: ; %bb.0: ; %entry ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index a7b6e5877adf4..3863da4fa6389 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -266,12 +266,11 @@ define i1 @snan_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: snan_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00 -; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -345,9 +344,8 @@ define i1 @qnan_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: qnan_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -420,7 +418,7 @@ define i1 @posinf_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: posinf_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -495,7 +493,7 @@ define i1 @neginf_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: neginf_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7SELDAG-NEXT: s_mov_b32 s4, 0xfc00 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -568,12 +566,11 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: posnormal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800 ; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1 ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5] @@ -655,12 +652,11 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: negnormal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800 ; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5] @@ -740,10 +736,9 @@ define i1 @possubnormal_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: possubnormal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -818,7 +813,6 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: negsubnormal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e64 v0, s[4:5], -1, v0 @@ -899,21 +893,13 @@ define i1 @negsubnormal_f16(half %x) nounwind { } define i1 @poszero_f16(half %x) nounwind { -; GFX7SELDAG-LABEL: poszero_f16: -; GFX7SELDAG: ; %bb.0: -; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7GLISEL-LABEL: poszero_f16: -; GFX7GLISEL: ; %bb.0: -; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7CHECK-LABEL: poszero_f16: +; GFX7CHECK: ; %bb.0: +; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: poszero_f16: ; GFX8CHECK: ; %bb.0: @@ -973,7 +959,7 @@ define i1 @negzero_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: negzero_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7SELDAG-NEXT: s_mov_b32 s4, 0x8000 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1046,7 +1032,7 @@ define i1 @posfinite_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: posfinite_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -1121,10 +1107,9 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: negfinite_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc @@ -1202,9 +1187,8 @@ define i1 @isnan_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: isnan_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -1277,9 +1261,8 @@ define i1 @not_isnan_f16(half %x) { ; GFX7SELDAG-LABEL: not_isnan_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -1859,12 +1842,8 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX7SELDAG-LABEL: isnan_f16_strictfp: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7SELDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -1937,9 +1916,8 @@ define i1 @isinf_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: isinf_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -2014,9 +1992,8 @@ define i1 @isfinite_f16(half %x) nounwind { ; GFX7SELDAG-LABEL: isfinite_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -2091,7 +2068,6 @@ define i1 @issubnormal_or_zero_f16(half %x) { ; GFX7SELDAG-LABEL: issubnormal_or_zero_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7c00, v0 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2167,7 +2143,6 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX7SELDAG-LABEL: not_issubnormal_or_zero_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7c00, v0 ; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2249,11 +2224,10 @@ define i1 @isnormal_f16(half %x) { ; GFX7SELDAG-LABEL: isnormal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7800 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7800 ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -2329,11 +2303,10 @@ define i1 @not_isnormal_f16(half %x) { ; GFX7SELDAG-LABEL: not_isnormal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x77ff ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x77ff ; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -2414,12 +2387,11 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX7SELDAG-LABEL: not_is_plus_normal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff ; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 ; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2510,12 +2482,11 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX7SELDAG-LABEL: not_is_neg_normal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff ; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1 ; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2606,10 +2577,9 @@ define i1 @issubnormal_f16(half %x) { ; GFX7SELDAG-LABEL: issubnormal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -2685,10 +2655,9 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX7SELDAG-LABEL: not_issubnormal_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3fe ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3fe ; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -2772,7 +2741,6 @@ define i1 @iszero_f16(half %x) { ; GFX7SELDAG-LABEL: iszero_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2847,7 +2815,6 @@ define i1 @not_iszero_f16(half %x) { ; GFX7SELDAG-LABEL: not_iszero_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -2935,7 +2902,7 @@ define i1 @ispositive_f16(half %x) { ; GFX7SELDAG-LABEL: ispositive_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -3010,17 +2977,17 @@ define i1 @not_ispositive_f16(half %x) { ; GFX7SELDAG-LABEL: not_ispositive_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7SELDAG-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7c00 +; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s6, v0 ; GFX7SELDAG-NEXT: s_mov_b32 s7, 0xfc00 -; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s6, v2 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s7, v0 +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v2 +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -3101,15 +3068,15 @@ define i1 @isnegative_f16(half %x) { ; GFX7SELDAG-LABEL: isnegative_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX7SELDAG-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0 ; GFX7SELDAG-NEXT: s_mov_b32 s6, 0xfc00 -; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v2 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -3188,13 +3155,13 @@ define i1 @not_isnegative_f16(half %x) { ; GFX7SELDAG-LABEL: not_isnegative_f16: ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 -; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00 -; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 +; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1 +; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -3272,9 +3239,8 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX7SELDAG-LABEL: iszero_or_nan_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc @@ -3353,9 +3319,8 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX7SELDAG-LABEL: iszero_or_nan_f_daz: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc @@ -3434,9 +3399,8 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7SELDAG-LABEL: iszero_or_nan_f_maybe_daz: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc @@ -3515,9 +3479,8 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX7SELDAG-LABEL: not_iszero_or_nan_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc @@ -3605,9 +3568,8 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX7SELDAG-LABEL: not_iszero_or_nan_f_daz: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc @@ -3695,9 +3657,8 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7SELDAG-LABEL: not_iszero_or_nan_f_maybe_daz: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc @@ -3785,9 +3746,8 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX7SELDAG-LABEL: iszero_or_qnan_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc @@ -3866,12 +3826,11 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX7SELDAG-LABEL: iszero_or_snan_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00 -; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -3952,10 +3911,9 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX7SELDAG-LABEL: not_iszero_or_qnan_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00 ; GFX7SELDAG-NEXT: s_movk_i32 s8, 0x7c00 -; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s8, v0 ; GFX7SELDAG-NEXT: s_and_b64 s[6:7], s[4:5], vcc @@ -4058,14 +4016,13 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX7SELDAG-LABEL: not_iszero_or_snan_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7dff ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_add_i32_e64 v1, s[4:5], -1, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1 +; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7dff ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v0 ; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc @@ -4161,9 +4118,8 @@ define i1 @isinf_or_nan_f16(half %x) { ; GFX7SELDAG-LABEL: isinf_or_nan_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7bff ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7bff ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -4239,9 +4195,8 @@ define i1 @not_isinf_or_nan_f16(half %x) { ; GFX7SELDAG-LABEL: not_isinf_or_nan_f16: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -4317,9 +4272,8 @@ define i1 @isfinite_or_nan_f(half %x) { ; GFX7SELDAG-LABEL: isfinite_or_nan_f: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] @@ -4395,9 +4349,8 @@ define i1 @not_isfinite_or_nan_f(half %x) { ; GFX7SELDAG-LABEL: not_isfinite_or_nan_f: ; GFX7SELDAG: ; %bb.0: ; %entry ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 749600b4a99f7..8c4d4788c4bdf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -192,14 +192,14 @@ define <2 x double> @test_ldexp_v2f64_v2i32(<2 x double> %a, <2 x i32> %b) { ; } define half @test_ldexp_f16_i8(half %a, i8 %b) { -; GFX6-SDAG-LABEL: test_ldexp_f16_i8: -; GFX6-SDAG: ; %bb.0: -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: test_ldexp_f16_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_ldexp_f16_i8: ; GFX8-SDAG: ; %bb.0: @@ -229,15 +229,6 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) { ; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-GISEL-LABEL: test_ldexp_f16_i8: -; GFX6-GISEL: ; %bb.0: -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] -; ; GFX8-GISEL-LABEL: test_ldexp_f16_i8: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -282,14 +273,14 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) { } define half @test_ldexp_f16_i16(half %a, i16 %b) { -; GFX6-SDAG-LABEL: test_ldexp_f16_i16: -; GFX6-SDAG: ; %bb.0: -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: test_ldexp_f16_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_ldexp_f16_i16: ; GFX8: ; %bb.0: @@ -315,15 +306,6 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) { ; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-GISEL-LABEL: test_ldexp_f16_i16: -; GFX6-GISEL: ; %bb.0: -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-GISEL-TRUE16-LABEL: test_ldexp_f16_i16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -340,13 +322,13 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) { } define half @test_ldexp_f16_i32(half %a, i32 %b) { -; GFX6-SDAG-LABEL: test_ldexp_f16_i32: -; GFX6-SDAG: ; %bb.0: -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: test_ldexp_f16_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_ldexp_f16_i32: ; GFX8-SDAG: ; %bb.0: @@ -384,14 +366,6 @@ define half @test_ldexp_f16_i32(half %a, i32 %b) { ; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-GISEL-LABEL: test_ldexp_f16_i32: -; GFX6-GISEL: ; %bb.0: -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] -; ; GFX8-GISEL-LABEL: test_ldexp_f16_i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1313,13 +1287,13 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { } define amdgpu_ps half @test_ldexp_f16_i16_uniform(half inreg %a, i16 inreg %b) { -; GFX6-SDAG-LABEL: test_ldexp_f16_i16_uniform: -; GFX6-SDAG: ; %bb.0: -; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, s0 -; GFX6-SDAG-NEXT: s_sext_i32_i16 s0, s1 -; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s0 -; GFX6-SDAG-NEXT: ; return to shader part epilog +; GFX6-LABEL: test_ldexp_f16_i16_uniform: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_sext_i32_i16 s0, s1 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: test_ldexp_f16_i16_uniform: ; GFX8: ; %bb.0: @@ -1343,14 +1317,6 @@ define amdgpu_ps half @test_ldexp_f16_i16_uniform(half inreg %a, i16 inreg %b) { ; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e64 v0, s0, s1 ; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog ; -; GFX6-GISEL-LABEL: test_ldexp_f16_i16_uniform: -; GFX6-GISEL: ; %bb.0: -; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-GISEL-NEXT: s_sext_i32_i16 s0, s1 -; GFX6-GISEL-NEXT: v_ldexp_f32_e64 v0, v0, s0 -; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-GISEL-NEXT: ; return to shader part epilog -; ; GFX11-GISEL-TRUE16-LABEL: test_ldexp_f16_i16_uniform: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e64 v0.l, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 59c1c2facb5c9..4e8ffdcb00310 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -6110,6 +6110,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; SI-LABEL: v_log_f32_from_fpext_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_mov_b32 s4, 0x800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc @@ -6287,25 +6288,14 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { } define half @v_log_f16(half %in) { -; SI-SDAG-LABEL: v_log_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_f16: ; VI: ; %bb.0: @@ -6367,25 +6357,14 @@ define half @v_log_f16(half %in) { } define half @v_log_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_fabs_f16: ; VI: ; %bb.0: @@ -6451,12 +6430,11 @@ define half @v_log_fneg_fabs_f16(half %in) { ; SI-SDAG-LABEL: v_log_fneg_fabs_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_fneg_fabs_f16: @@ -6533,12 +6511,11 @@ define half @v_log_fneg_f16(half %in) { ; SI-SDAG-LABEL: v_log_fneg_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log_fneg_f16: @@ -6611,23 +6588,14 @@ define half @v_log_fneg_f16(half %in) { } define half @v_log_f16_fast(half %in) { -; SI-SDAG-LABEL: v_log_f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log_f16_fast: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 2dc85d3c161a0..843b829f28742 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -6110,6 +6110,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; SI-LABEL: v_log10_f32_from_fpext_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_mov_b32 s4, 0x800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc @@ -6287,25 +6288,14 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { } define half @v_log10_f16(half %in) { -; SI-SDAG-LABEL: v_log10_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_f16: ; VI: ; %bb.0: @@ -6367,25 +6357,14 @@ define half @v_log10_f16(half %in) { } define half @v_log10_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log10_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_fabs_f16: ; VI: ; %bb.0: @@ -6451,12 +6430,11 @@ define half @v_log10_fneg_fabs_f16(half %in) { ; SI-SDAG-LABEL: v_log10_fneg_fabs_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_fneg_fabs_f16: @@ -6533,12 +6511,11 @@ define half @v_log10_fneg_f16(half %in) { ; SI-SDAG-LABEL: v_log10_fneg_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log10_fneg_f16: @@ -6611,23 +6588,14 @@ define half @v_log10_fneg_f16(half %in) { } define half @v_log10_f16_fast(half %in) { -; SI-SDAG-LABEL: v_log10_f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log10_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log10_f16_fast: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 047cc9addbcfc..35ae1337d8e76 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -3622,10 +3622,11 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; SI-LABEL: v_log2_f32_from_fpext_bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_mov_b32 s4, 0x800000 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc -; SI-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; SI-NEXT: v_log_f32_e32 v0, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0x42000000 ; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc @@ -3737,23 +3738,13 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { } define half @v_log2_f16(half %in) { -; SI-SDAG-LABEL: v_log2_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log2_f16: ; VI: ; %bb.0: @@ -3805,23 +3796,13 @@ define half @v_log2_f16(half %in) { } define half @v_log2_fabs_f16(half %in) { -; SI-SDAG-LABEL: v_log2_fabs_f16: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_fabs_f16: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_fabs_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log2_fabs_f16: ; VI: ; %bb.0: @@ -3877,11 +3858,10 @@ define half @v_log2_fneg_fabs_f16(half %in) { ; SI-SDAG-LABEL: v_log2_fneg_fabs_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fneg_fabs_f16: @@ -3947,11 +3927,10 @@ define half @v_log2_fneg_f16(half %in) { ; SI-SDAG-LABEL: v_log2_fneg_f16: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_log2_fneg_f16: @@ -4013,21 +3992,13 @@ define half @v_log2_fneg_f16(half %in) { } define half @v_log2_f16_fast(half %in) { -; SI-SDAG-LABEL: v_log2_f16_fast: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log2_f16_fast: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_log2_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_log_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_log2_f16_fast: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 6d371d4b76e0b..3c27adde10b78 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -13,14 +13,13 @@ define half @v_maximum_f16(half %src0, half %src1) { ; GFX7-LABEL: v_maximum_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_f16: @@ -100,11 +99,10 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) { ; GFX7-LABEL: v_maximum_f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_f16__nnan: @@ -164,14 +162,13 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) { ; GFX7-LABEL: v_maximum_f16__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_f16__nsz: @@ -251,11 +248,10 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) { ; GFX7-LABEL: v_maximum_f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_f16__nnan_nsz: @@ -315,15 +311,14 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX7-LABEL: v_maximum_f16__nnan_src0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_f16__nnan_src0: @@ -414,15 +409,14 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX7-LABEL: v_maximum_f16__nnan_src1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_f16__nnan_src1: @@ -513,11 +507,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX7-LABEL: s_maximum_f16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s17 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s17 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_max_f32_e32 v3, v1, v0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 24d6f4f84e816..e79324d7655fc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -179,11 +179,10 @@ entry: define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 { ; SI-LABEL: minnum_f16_no_ieee: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: minnum_f16_no_ieee: @@ -589,7 +588,6 @@ define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: minnum_v2f16_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 4233367b3d5bb..9778c61c44e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -1572,7 +1572,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_f16: @@ -1606,7 +1605,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, half 4.0 seq_cst ret half %result @@ -1967,7 +1965,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_f16__offset: @@ -2002,7 +1999,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fadd ptr addrspace(3) %gep, half 4.0 seq_cst @@ -3021,34 +3017,33 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_f16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fadd ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -3754,7 +3749,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_bf16: @@ -3788,7 +3782,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret bfloat %result @@ -4220,7 +4213,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_bf16__offset: @@ -4255,7 +4247,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fadd ptr addrspace(3) %gep, bfloat 4.0 seq_cst @@ -5492,34 +5483,33 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fadd_ret_bf16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v0, 4.0, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fadd ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 4dd7f0e3c450a..91add012bdcfa 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -1127,7 +1127,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_f16: @@ -1161,7 +1160,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, half 4.0 seq_cst ret half %result @@ -1533,7 +1531,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_f16__offset: @@ -1568,7 +1565,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmax ptr addrspace(3) %gep, half 4.0 seq_cst @@ -2620,34 +2616,33 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_f16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmax ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -3365,7 +3360,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_bf16: @@ -3400,7 +3394,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret bfloat %result @@ -3833,7 +3826,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_bf16__offset: @@ -3869,7 +3861,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmax ptr addrspace(3) %gep, bfloat 4.0 seq_cst @@ -5111,35 +5102,34 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmax_ret_bf16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmax ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 57fe5f708e216..8597c2e256584 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -1127,7 +1127,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_f16: @@ -1161,7 +1160,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, half 4.0 seq_cst ret half %result @@ -1533,7 +1531,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_f16__offset: @@ -1568,7 +1565,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmin ptr addrspace(3) %gep, half 4.0 seq_cst @@ -2620,34 +2616,33 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_f16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmin ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -3365,7 +3360,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_bf16: @@ -3400,7 +3394,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret bfloat %result @@ -3833,7 +3826,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_bf16__offset: @@ -3869,7 +3861,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmin ptr addrspace(3) %gep, bfloat 4.0 seq_cst @@ -5111,35 +5102,34 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fmin_ret_bf16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fmin ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 3a971a3b5a8d2..290d3117cac9a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -2034,7 +2034,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_f16: @@ -2068,7 +2067,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, half 4.0 seq_cst ret half %result @@ -2429,7 +2427,6 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_f16__offset: @@ -2464,7 +2461,6 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fsub ptr addrspace(3) %gep, half 4.0 seq_cst @@ -3483,34 +3479,33 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_f16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_add_f32_e32 v0, -4.0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fsub ptr addrspace(3) %gep, half 4.0 seq_cst, align 4 @@ -4216,7 +4211,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_bf16: @@ -4250,7 +4244,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret bfloat %result @@ -4682,7 +4675,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_bf16__offset: @@ -4717,7 +4709,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst @@ -5954,34 +5945,33 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: local_atomic_fsub_ret_bf16__offset__align4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: ds_read_b32 v0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX6-NEXT: v_add_f32_e32 v0, -4.0, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB18_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767 %result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 3d48ff437e8ff..7dc9304d5715b 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -38,6 +38,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -101,6 +104,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -173,9 +179,12 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SDAG-CI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -246,6 +255,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -325,6 +337,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -419,6 +434,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -480,6 +498,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp @@ -560,10 +581,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 -; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_mov_b32 s7, 0xf000 ; SDAG-CI-NEXT: s_mov_b32 s6, -1 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp ; SDAG-CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index c452f9701ca00..87d33c1c063eb 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -45,7 +45,6 @@ define half @mixlo_simple(float %src0, float %src1, float %src2) #0 { ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: mixlo_simple: @@ -92,7 +91,6 @@ define half @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) { ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: mixlo_simpl_no_flush: @@ -138,9 +136,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: @@ -197,9 +197,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush(half %src0, half %src1, ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush: @@ -250,9 +252,10 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2 ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: @@ -301,9 +304,12 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: @@ -372,9 +378,10 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: @@ -2524,7 +2531,6 @@ define half @mixlo_fptrunc(float %a, float %b) #0 { ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: mixlo_fptrunc: @@ -2571,7 +2577,6 @@ define half @mixlo_fptrunc_no_flush(float %a, float %b) { ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: mixlo_fptrunc_no_flush: @@ -2617,7 +2622,6 @@ define half @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 { ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: mixlo_fptrunc_abs_src_mod: @@ -2664,7 +2668,6 @@ define half @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 { ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-CI-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: mixlo_fptrunc_neg_src_mod: diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index a252a63ca83e0..ee250fc74c7ae 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -56,7 +56,11 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: @@ -459,6 +463,9 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s ; SDAG-CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -536,7 +543,11 @@ define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %s ; SDAG-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: @@ -595,7 +606,10 @@ define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half ; SDAG-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_mad_f32 v0, -|v0|, v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: @@ -653,6 +667,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -707,6 +723,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %sr ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -762,6 +780,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %sr ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, |v2| ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -817,6 +837,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -881,6 +903,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -961,6 +985,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1048,6 +1074,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1149,6 +1177,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1814,6 +1844,9 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %sr ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1873,6 +1906,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -1955,6 +1990,9 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -2047,6 +2085,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -2116,7 +2156,11 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, hal ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: @@ -2173,6 +2217,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2230,6 +2276,8 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1 ; SDAG-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] @@ -2329,9 +2377,11 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half % ; SDAG-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: @@ -2403,9 +2453,11 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1 ; SDAG-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: @@ -2467,9 +2519,11 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, -v3 +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: @@ -2552,9 +2606,11 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3| +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: @@ -2637,9 +2693,11 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: ; SDAG-CI: ; %bb.0: ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, -|v3| +; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll index 65b2f016a6ba0..c7acbb0584904 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll @@ -13,12 +13,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX7-LABEL: v_maximumnum_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_bf16: @@ -222,12 +222,10 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX7-LABEL: v_maximumnum_bf16_nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_bf16_nnan: @@ -12322,12 +12320,10 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 { ; GFX7-LABEL: v_maximumnum_bf16_no_ieee: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_bf16_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 7d52b2e1d70c6..086c78fd041fc 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -27,24 +27,14 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s define half @v_maximumnum_f16(half %x, half %y) { -; GFX7-SDAG-LABEL: v_maximumnum_f16: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f16: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16: ; GFX8-SDAG: ; %bb.0: @@ -189,11 +179,10 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) { ; GFX7-SDAG-LABEL: v_maximumnum_f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_f16_nnan: @@ -259,21 +248,13 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) { } define half @v_maximumnum_f16_1.0(half %x) { -; GFX7-SDAG-LABEL: v_maximumnum_f16_1.0: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f16_1.0: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f16_1.0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximumnum_f16_1.0: ; GFX8: ; %bb.0: @@ -924,24 +905,14 @@ define double @v_maximumnum_f64_1.0(double %x) { } define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { -; GFX7-SDAG-LABEL: v_maximumnum_f16_s_v: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v1, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f16_s_v: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v1, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_s_v: ; GFX8-SDAG: ; %bb.0: @@ -1099,24 +1070,14 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { } define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { -; GFX7-SDAG-LABEL: v_maximumnum_f16_v_s: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f16_v_s: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_v_s: ; GFX8-SDAG: ; %bb.0: @@ -1274,24 +1235,14 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { } define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) { -; GFX7-SDAG-LABEL: v_maximumnum_f16_s_s: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, s16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f16_s_s: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s17 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s17 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_s_s: ; GFX8-SDAG: ; %bb.0: @@ -2612,24 +2563,14 @@ define float @v_maximumnum_f32_fneg(float %x, float %y) { } define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) { -; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs_rhs: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f16_fabs_rhs: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_fabs_rhs: ; GFX8-SDAG: ; %bb.0: @@ -2775,11 +2716,11 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { ; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_f16_fneg_fabs_rhs: @@ -2933,24 +2874,14 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { } define half @v_maximumnum_f16_fabs(half %x, half %y) { -; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f16_fabs: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f16_fabs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_f16_fabs: ; GFX8-SDAG: ; %bb.0: @@ -3097,11 +3028,12 @@ define half @v_maximumnum_f16_fneg(half %x, half %y) { ; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v1, -v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_f16_fneg: @@ -8222,11 +8154,10 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 { ; GFX7-SDAG-LABEL: v_maximumnum_f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_f16_no_ieee: @@ -8327,11 +8258,10 @@ define half @v_maximumnum_f16_nan_no_ieee(half %x, half %y) #0 { ; GFX7-SDAG-LABEL: v_maximumnum_f16_nan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_maximumnum_f16_nan_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll index a3c9977fee488..0a794a3ac49b1 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll @@ -13,12 +13,12 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX7-LABEL: v_minimumnum_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_bf16: @@ -224,12 +224,10 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX7-LABEL: v_minimumnum_bf16_nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_bf16_nnan: @@ -12355,12 +12353,10 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 { ; GFX7-LABEL: v_minimumnum_bf16_no_ieee: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_bf16_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 329118e3dca01..0311caf93a14e 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -27,24 +27,14 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s define half @v_minimumnum_f16(half %x, half %y) { -; GFX7-SDAG-LABEL: v_minimumnum_f16: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_minimumnum_f16: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_minimumnum_f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16: ; GFX8-SDAG: ; %bb.0: @@ -189,11 +179,10 @@ define half @v_minimumnum_f16_nnan(half %x, half %y) { ; GFX7-SDAG-LABEL: v_minimumnum_f16_nnan: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_f16_nnan: @@ -259,21 +248,13 @@ define half @v_minimumnum_f16_nnan(half %x, half %y) { } define half @v_minimumnum_f16_1.0(half %x) { -; GFX7-SDAG-LABEL: v_minimumnum_f16_1.0: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_minimumnum_f16_1.0: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_minimumnum_f16_1.0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimumnum_f16_1.0: ; GFX8: ; %bb.0: @@ -924,24 +905,14 @@ define double @v_minimumnum_f64_1.0(double %x) { } define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { -; GFX7-SDAG-LABEL: v_minimumnum_f16_v_s: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s16 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_minimumnum_f16_v_s: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_minimumnum_f16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_v_s: ; GFX8-SDAG: ; %bb.0: @@ -1099,24 +1070,14 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { } define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) { -; GFX7-SDAG-LABEL: v_minimumnum_f16_s_s: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, s16 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s17 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_minimumnum_f16_s_s: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s16 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s17 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_minimumnum_f16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s16 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s17 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_s_s: ; GFX8-SDAG: ; %bb.0: @@ -2437,24 +2398,14 @@ define float @v_minimumnum_f32_fneg(float %x, float %y) { } define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) { -; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs_rhs: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_minimumnum_f16_fabs_rhs: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_fabs_rhs: ; GFX8-SDAG: ; %bb.0: @@ -2600,11 +2551,11 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { ; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_f16_fneg_fabs_rhs: @@ -2758,24 +2709,14 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { } define half @v_minimumnum_f16_fabs(half %x, half %y) { -; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_minimumnum_f16_fabs: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_minimumnum_f16_fabs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_f16_fabs: ; GFX8-SDAG: ; %bb.0: @@ -2922,11 +2863,12 @@ define half @v_minimumnum_f16_fneg(half %x, half %y) { ; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v1, -v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_f16_fneg: @@ -8047,11 +7989,10 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 { ; GFX7-SDAG-LABEL: v_minimumnum_f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_f16_no_ieee: @@ -8152,11 +8093,10 @@ define half @v_minimumnum_f16_nan_no_ieee(half %x, half %y) #0 { ; GFX7-SDAG-LABEL: v_minimumnum_f16_nan_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: v_minimumnum_f16_nan_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index 9371ce57dc0fe..90632c663bf4a 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -1083,10 +1083,9 @@ define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 { define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 { ; SI-LABEL: v_omod_div2_f16_denormals: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1140,10 +1139,9 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 { define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 { ; SI-LABEL: v_omod_mul2_f16_denormals: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1196,10 +1194,9 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 { define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 { ; SI-LABEL: v_omod_div2_f16_no_denormals: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index 4e70640609fef..18e2d3d11265b 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -462,8 +462,8 @@ define bfloat @atomicrmw_fadd_private_bf16(ptr addrspace(5) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_f32_e32 v2, 2.0, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll index 38d0a377a3ffb..ff3a735bd32b4 100644 --- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll +++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll @@ -246,17 +246,14 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 { ; GFX6-LABEL: v_repeat_divisor_f16_x2_arcp: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-NEXT: v_fma_f32 v4, v5, v4, v4 ; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 -; GFX6-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-NEXT: v_fma_f32 v4, v6, v4, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v4 ; GFX6-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX6-NEXT: v_fma_f32 v6, v7, v4, v6 @@ -530,10 +527,6 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half ; GFX6-LABEL: v_repeat_divisor_f16_x3_arcp: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 0b7e4e90dc317..99d494d4feaf4 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -380,17 +380,17 @@ define half @v_roundeven_f16(half %x) { ; SDAG_GFX6-LABEL: v_roundeven_f16: ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_f16: ; SDAG_GFX7: ; %bb.0: ; SDAG_GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SDAG_GFX7-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX8-LABEL: v_roundeven_f16: diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index 1222d0efd62bb..9ae6e60385bbb 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -13,15 +13,13 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_fabs_f16: @@ -79,10 +77,6 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -91,6 +85,8 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e64 v0, |v0|, v4 ; CI-NEXT: v_add_f32_e64 v1, |v1|, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: @@ -156,16 +152,14 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half ; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v4 -; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e64 v1, |v1| ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: @@ -231,10 +225,6 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -243,6 +233,8 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e64 v0, |v0|, v3 ; CI-NEXT: v_add_f32_e64 v1, |v2|, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: @@ -308,15 +300,13 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_var_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_var_f16: @@ -378,13 +368,12 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fabs_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_negk_f16: @@ -448,11 +437,11 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_fabs_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_negk_negk_f16: @@ -515,11 +504,11 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_posk_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e64 v0, 1.0, 2.0, vcc -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_posk_posk_f16: @@ -581,13 +570,12 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negk_fabs_f16: @@ -650,14 +638,13 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negliteralk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0xc4800000 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0xc4800000 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negliteralk_fabs_f16: @@ -720,13 +707,12 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fabs_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; CI-NEXT: v_add_f32_e64 v0, |v0|, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_posk_f16: @@ -784,13 +770,12 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_posk_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; CI-NEXT: v_add_f32_e64 v0, |v0|, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_posk_fabs_f16: @@ -848,15 +833,13 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_fneg_f16: @@ -914,10 +897,6 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -926,6 +905,8 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_sub_f32_e32 v1, v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: @@ -991,16 +972,15 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half ; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: @@ -1066,10 +1046,6 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h ; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -1078,6 +1054,8 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 ; CI-NEXT: v_sub_f32_e32 v1, v4, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: @@ -1143,15 +1121,14 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fneg_var_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e64 v1, -v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_var_f16: @@ -1213,13 +1190,12 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_negk_f16: @@ -1277,14 +1253,13 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_inv2pi_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0xbe230000 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0xbe230000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_inv2pi_f16: @@ -1342,14 +1317,13 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_neginv2pi_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_mov_b32_e32 v3, 0x3e230000 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mov_b32_e32 v3, 0x3e230000 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_neginv2pi_f16: @@ -1407,11 +1381,11 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negk_negk_f16: @@ -1473,13 +1447,13 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_negliteralk_negliteralk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_mov_b32_e32 v2, 0xc5800000 ; CI-NEXT: v_mov_b32_e32 v3, 0xc5000000 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negliteralk_negliteralk_f16: @@ -1541,11 +1515,11 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) { ; CI-LABEL: add_select_fneg_negk_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_negk_negk_f16: @@ -1608,13 +1582,12 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_negk_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negk_fneg_f16: @@ -1672,13 +1645,12 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_fneg_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fneg_posk_f16: @@ -1736,13 +1708,12 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) { ; CI-LABEL: add_select_posk_fneg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v2, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_posk_fneg_f16: @@ -1800,15 +1771,14 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_negfabs_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negfabs_fabs_f16: @@ -1877,15 +1847,14 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e64 v2, -|v2| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_negfabs_f16: @@ -1954,15 +1923,14 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_neg_fabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e64 v1, -v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| +; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_neg_fabs_f16: @@ -2030,15 +1998,14 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_neg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e64 v2, -v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v2, -v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_fabs_neg_f16: @@ -2106,15 +2073,13 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_neg_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_neg_negfabs_f16: @@ -2178,15 +2143,13 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_negfabs_neg_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; CI-NEXT: v_sub_f32_e32 v0, v3, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: add_select_negfabs_neg_f16: @@ -2250,13 +2213,13 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negfabs_posk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc ; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negfabs_posk_f16: @@ -2320,13 +2283,13 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_posk_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc ; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_posk_negfabs_f16: @@ -2390,13 +2353,13 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negfabs_negk_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc ; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negfabs_negk_f16: @@ -2460,13 +2423,13 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) { ; CI-LABEL: mul_select_negk_negfabs_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc ; CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negk_negfabs_f16: @@ -2534,11 +2497,11 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; CI-SAFE-LABEL: select_fneg_posk_src_add_f16: ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_add_f32_e32 v1, 4.0, v1 ; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_add_f16: @@ -2574,11 +2537,11 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; CI-NSZ-LABEL: select_fneg_posk_src_add_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, -4.0, v1 ; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_add_f16: @@ -2618,11 +2581,11 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; CI-SAFE-LABEL: select_fneg_posk_src_sub_f16: ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_add_f32_e32 v1, -4.0, v1 ; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_sub_f16: @@ -2658,11 +2621,11 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, 4.0, v1 ; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_sub_f16: @@ -2702,11 +2665,11 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) { ; CI-LABEL: select_fneg_posk_src_mul_f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_mul_f16: @@ -2764,13 +2727,12 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; CI-SAFE-LABEL: select_fneg_posk_src_fma_f16: ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_fma_f32 v1, v1, 4.0, v2 ; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fma_f16: @@ -2806,13 +2768,12 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_fma_f32 v1, v1, -4.0, -v2 ; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fma_f16: @@ -2852,14 +2813,13 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; CI-SAFE-LABEL: select_fneg_posk_src_fmad_f16: ; CI-SAFE: ; %bb.0: ; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; CI-SAFE-NEXT: v_add_f32_e32 v1, v1, v2 ; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc +; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_f16: @@ -2895,14 +2855,13 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16: ; CI-NSZ: ; %bb.0: ; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v1 ; CI-NSZ-NEXT: v_sub_f32_e32 v1, v1, v2 ; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc +; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_f16: diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index 0de366132e31e..c7422a25f71e7 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -521,11 +521,10 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_safe: @@ -567,11 +566,10 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag: @@ -613,11 +611,10 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag: @@ -659,11 +656,10 @@ define half @v_test_fmin_legacy_ule_f16_nnan_nsz_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag: @@ -700,11 +696,10 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_safe: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_safe: @@ -746,11 +741,10 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag: @@ -792,11 +786,10 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag: @@ -838,11 +831,10 @@ define half @v_test_fmax_legacy_uge_f16_nnan_nsz_flag(half %a, half %b) { ; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll index 5335787a820be..9a52b96bde709 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll @@ -10,8 +10,6 @@ define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 { ; SI-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -199,8 +197,6 @@ define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 { ; SI-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -352,10 +348,7 @@ define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 ; SI-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: @@ -394,10 +387,8 @@ define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0 ; SI-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll index dc57c22f16a26..31c64046de11a 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll @@ -13,6 +13,7 @@ define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: @@ -227,7 +228,7 @@ define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) # ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: @@ -269,6 +270,7 @@ define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) # ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 8e43f4e788bb0..9fe064c717972 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -17,8 +17,6 @@ define void @f16_arg(half %arg, ptr %ptr) #0 { ; GFX7-LABEL: f16_arg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -102,6 +100,7 @@ define half @f16_return(float %arg) #0 { ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %fptrunc @@ -194,8 +193,6 @@ define void @outgoing_f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, f16_user@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 @@ -264,17 +261,13 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_mov_b32_e32 v41, v1 ; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: flat_store_short v[40:41], v0 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: flat_store_short v[40:41], v0 -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] @@ -436,6 +429,7 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll index 98e7df04be444..deb140fa7e941 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll @@ -20,13 +20,13 @@ define half @test_vector_reduce_fadd_v2half(half %sp, <2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v2half: @@ -158,15 +158,15 @@ define half @test_vector_reduce_fadd_v3half(half %sp, <3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v3half: @@ -311,11 +311,10 @@ define half @test_vector_reduce_fadd_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -323,6 +322,7 @@ define half @test_vector_reduce_fadd_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v4half: @@ -499,11 +499,10 @@ define half @test_vector_reduce_fadd_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 @@ -521,6 +520,7 @@ define half @test_vector_reduce_fadd_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v8half: @@ -787,11 +787,10 @@ define half @test_vector_reduce_fadd_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -829,6 +828,7 @@ define half @test_vector_reduce_fadd_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v10 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v16half: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll index b9dcb1b7295c2..4c212daab39ee 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll @@ -24,6 +24,7 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v2half: @@ -189,6 +190,7 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v3half: @@ -396,6 +398,7 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v4half: @@ -631,6 +634,7 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) { ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v6 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v5 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v3, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v8half: @@ -994,6 +998,7 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) { ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v5, v10 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v6, v9 ; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v7, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v16half: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll index 58da94d7c4683..d198bb45654da 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll @@ -19,6 +19,7 @@ define half @test_vector_reduce_fmaximum_v2half(<2 x half> %v) { ; GFX7-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fmaximum_v2half: @@ -109,6 +110,7 @@ define half @test_vector_reduce_fmaximum_v3half(<3 x half> %v) { ; GFX7-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fmaximum_v3half: @@ -230,6 +232,7 @@ define half @test_vector_reduce_fmaximum_v4half(<4 x half> %v) { ; GFX7-NEXT: v_max_f32_e32 v1, v0, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fmaximum_v4half: @@ -384,6 +387,7 @@ define half @test_vector_reduce_fmaximum_v8half(<8 x half> %v) { ; GFX7-NEXT: v_max_f32_e32 v1, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fmaximum_v8half: @@ -646,6 +650,7 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) { ; GFX7-NEXT: v_max_f32_e32 v1, v0, v8 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fmaximum_v16half: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll index 07524d6917740..479dc08a4f7aa 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll @@ -24,6 +24,7 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v2half: @@ -189,6 +190,7 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v1 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v3half: @@ -396,6 +398,7 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) { ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v4half: @@ -631,6 +634,7 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) { ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v6 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v5 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v3, v4 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v8half: @@ -994,6 +998,7 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) { ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v5, v10 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v6, v9 ; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v7, v8 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v16half: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll index 16732a429e4b0..506d847c1144b 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll @@ -21,6 +21,7 @@ define half @test_vector_reduce_fminimum_v2half(<2 x half> %v) { ; GFX7-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fminimum_v2half: @@ -133,6 +134,7 @@ define half @test_vector_reduce_fminimum_v3half(<3 x half> %v) { ; GFX7-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fminimum_v3half: @@ -279,6 +281,7 @@ define half @test_vector_reduce_fminimum_v4half(<4 x half> %v) { ; GFX7-NEXT: v_min_f32_e32 v1, v0, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fminimum_v4half: @@ -462,6 +465,7 @@ define half @test_vector_reduce_fminimum_v8half(<8 x half> %v) { ; GFX7-NEXT: v_min_f32_e32 v1, v0, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fminimum_v8half: @@ -766,6 +770,7 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) { ; GFX7-NEXT: v_min_f32_e32 v1, v0, v8 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_vector_reduce_fminimum_v16half: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll index 45fc82abb507e..7ea92e7b3582c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll @@ -20,13 +20,13 @@ define half @test_vector_reduce_fmul_v2half(half %sp, <2 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v2half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v2half: @@ -158,15 +158,15 @@ define half @test_vector_reduce_fmul_v3half(half %sp, <3 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v3half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v3half: @@ -311,11 +311,10 @@ define half @test_vector_reduce_fmul_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v4half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -323,6 +322,7 @@ define half @test_vector_reduce_fmul_v4half(half %sp, <4 x half> %v) { ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v4half: @@ -499,11 +499,10 @@ define half @test_vector_reduce_fmul_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v8half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7 @@ -521,6 +520,7 @@ define half @test_vector_reduce_fmul_v8half(half %sp, <8 x half> %v) { ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v8half: @@ -787,11 +787,10 @@ define half @test_vector_reduce_fmul_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v16half: ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -829,6 +828,7 @@ define half @test_vector_reduce_fmul_v16half(half %sp, <16 x half> %v) { ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v10 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v9 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v16half: