diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 139e2d101a077..ed5988ee6efc3 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1111,11 +1111,8 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 16) { - if (Subtarget->has16BitInsts()) { - if (VT.isInteger()) - return MVT::v2i16; - return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16); - } + if (Subtarget->has16BitInsts()) + return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2); return VT.isInteger() ? MVT::i32 : MVT::f32; } @@ -1167,13 +1164,8 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( // support, but unless we can properly handle 3-vectors, it will be still be // inconsistent. if (Size == 16 && Subtarget->has16BitInsts()) { - if (ScalarVT == MVT::bf16) { - RegisterVT = MVT::i32; - IntermediateVT = MVT::v2bf16; - } else { - RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; - IntermediateVT = RegisterVT; - } + RegisterVT = MVT::getVectorVT(ScalarVT.getSimpleVT(), 2); + IntermediateVT = RegisterVT; NumIntermediates = (NumElts + 1) / 2; return NumIntermediates; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll index 3206f8e55f44e..0213bb35cc3a2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll @@ -8,24 +8,20 @@ define <3 x bfloat> @v3bf16(<3 x bfloat> %arg0) { ; GFX9: bb.1 (%ir-block.0): ; GFX9-NEXT: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<3 x s16>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16) - ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2) + ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2) ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<3 x s16>) - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT5]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<4 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV7]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV8]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %res = shufflevector <3 x bfloat> %arg0, <3 x bfloat> zeroinitializer, <3 x i32> ret <3 x bfloat> %res @@ -36,24 +32,15 @@ define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) { ; GFX9: bb.1 (%ir-block.0): ; GFX9-NEXT: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000 - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<4 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0) - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>) - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<4 x s16>), [[BUILD_VECTOR]], shufflemask(3, 1, 2, 0) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> ret <4 x bfloat> %res @@ -64,30 +51,22 @@ define <5 x bfloat> @v5bf16(<5 x bfloat> %arg0) { ; GFX9: bb.1 (%ir-block.0): ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16) - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32) - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<5 x s16>) = G_TRUNC [[BUILD_VECTOR]](<5 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4) + ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4) ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<5 x s16>) - ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16) - ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16) - ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT6]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT7]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT8]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<6 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV11]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV12]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[UV13]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %res = shufflevector <5 x bfloat> %arg0, <5 x bfloat> zeroinitializer, <5 x i32> ret <5 x bfloat> %res @@ -98,30 +77,17 @@ define <6 x bfloat> @v6bf16(<6 x bfloat> %arg0) { ; GFX9: bb.1 (%ir-block.0): ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16) - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32) - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<6 x s16>) = G_TRUNC [[BUILD_VECTOR]](<6 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000 - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<6 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5) - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>) - ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16) - ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16) - ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT6]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT7]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT8]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<6 x s16>), [[BUILD_VECTOR]], shufflemask(3, 1, 2, 0, 4, 5) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %res = shufflevector <6 x bfloat> %arg0, <6 x bfloat> zeroinitializer, <6 x i32> ret <6 x bfloat> %res @@ -132,36 +98,24 @@ define <7 x bfloat> @v7bf16(<7 x bfloat> %arg0) { ; GFX9: bb.1 (%ir-block.0): ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16) - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32) - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16) - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32) - ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16) - ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<7 x s16>) = G_TRUNC [[BUILD_VECTOR]](<7 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6) + ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6) ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<7 x s16>) - ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16) - ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16) - ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16) - ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT8]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT9]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT10]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT11]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV15]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV16]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[UV17]](<2 x s16>) + ; GFX9-NEXT: $vgpr3 = COPY [[UV18]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %res = shufflevector <7 x bfloat> %arg0, <7 x bfloat> zeroinitializer, <7 x i32> ret <7 x bfloat> %res @@ -172,36 +126,19 @@ define <8 x bfloat> @v8bf16(<8 x bfloat> %arg0) { ; GFX9: bb.1 (%ir-block.0): ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16) - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32) - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16) - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32) - ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16) - ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s16>) = G_TRUNC [[BUILD_VECTOR]](<8 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000 - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<8 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6, 7) - ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<8 x s16>) - ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16) - ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16) - ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16) - ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT8]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT9]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT10]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT11]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s16>), [[BUILD_VECTOR]], shufflemask(3, 1, 2, 0, 4, 5, 6, 7) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[SHUF]](<8 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %res = shufflevector <8 x bfloat> %arg0, <8 x bfloat> zeroinitializer, <8 x i32> ret <8 x bfloat> %res @@ -212,57 +149,24 @@ define <16 x bfloat> @v16bf16(<16 x bfloat> %arg0) { ; GFX9: bb.1 (%ir-block.0): ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16) - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32) - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16) - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32) - ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16) - ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16) - ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY4]](s32) - ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16) - ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16) - ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY5]](s32) - ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16) - ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16) - ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY6]](s32) - ; GFX9-NEXT: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s16) - ; GFX9-NEXT: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s16) - ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY7]](s32) - ; GFX9-NEXT: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s16) - ; GFX9-NEXT: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<16 x s16>) = G_TRUNC [[BUILD_VECTOR]](<16 x s32>) - ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<16 x s16>) - ; GFX9-NEXT: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16) - ; GFX9-NEXT: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16) - ; GFX9-NEXT: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16) - ; GFX9-NEXT: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16) - ; GFX9-NEXT: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16) - ; GFX9-NEXT: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16) - ; GFX9-NEXT: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16) - ; GFX9-NEXT: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT16]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT17]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT18]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT19]](s32) - ; GFX9-NEXT: $vgpr4 = COPY [[ANYEXT20]](s32) - ; GFX9-NEXT: $vgpr5 = COPY [[ANYEXT21]](s32) - ; GFX9-NEXT: $vgpr6 = COPY [[ANYEXT22]](s32) - ; GFX9-NEXT: $vgpr7 = COPY [[ANYEXT23]](s32) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](<2 x s16>) + ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](<2 x s16>) + ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](<2 x s16>) + ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](<2 x s16>) + ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ret <16 x bfloat> %arg0 } @@ -272,105 +176,40 @@ define <32 x bfloat> @v32bf16(<32 x bfloat> %arg0) { ; GFX9: bb.1 (%ir-block.0): ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16) - ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32) - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16) - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32) - ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16) - ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16) - ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY4]](s32) - ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16) - ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16) - ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY5]](s32) - ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16) - ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16) - ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY6]](s32) - ; GFX9-NEXT: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s16) - ; GFX9-NEXT: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s16) - ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY7]](s32) - ; GFX9-NEXT: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s16) - ; GFX9-NEXT: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16) - ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY8]](s32) - ; GFX9-NEXT: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16) - ; GFX9-NEXT: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16) - ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY9]](s32) - ; GFX9-NEXT: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16) - ; GFX9-NEXT: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16) - ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY10]](s32) - ; GFX9-NEXT: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16) - ; GFX9-NEXT: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16) - ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY11]](s32) - ; GFX9-NEXT: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16) - ; GFX9-NEXT: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16) - ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY12]](s32) - ; GFX9-NEXT: [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[UV24]](s16) - ; GFX9-NEXT: [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[UV25]](s16) - ; GFX9-NEXT: [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY13]](s32) - ; GFX9-NEXT: [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[UV26]](s16) - ; GFX9-NEXT: [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[UV27]](s16) - ; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY14]](s32) - ; GFX9-NEXT: [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[UV28]](s16) - ; GFX9-NEXT: [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[UV29]](s16) - ; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY15]](s32) - ; GFX9-NEXT: [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[UV30]](s16) - ; GFX9-NEXT: [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[UV31]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32), [[ANYEXT16]](s32), [[ANYEXT17]](s32), [[ANYEXT18]](s32), [[ANYEXT19]](s32), [[ANYEXT20]](s32), [[ANYEXT21]](s32), [[ANYEXT22]](s32), [[ANYEXT23]](s32), [[ANYEXT24]](s32), [[ANYEXT25]](s32), [[ANYEXT26]](s32), [[ANYEXT27]](s32), [[ANYEXT28]](s32), [[ANYEXT29]](s32), [[ANYEXT30]](s32), [[ANYEXT31]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<32 x s16>) = G_TRUNC [[BUILD_VECTOR]](<32 x s32>) - ; GFX9-NEXT: [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16), [[UV63:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<32 x s16>) - ; GFX9-NEXT: [[ANYEXT32:%[0-9]+]]:_(s32) = G_ANYEXT [[UV32]](s16) - ; GFX9-NEXT: [[ANYEXT33:%[0-9]+]]:_(s32) = G_ANYEXT [[UV33]](s16) - ; GFX9-NEXT: [[ANYEXT34:%[0-9]+]]:_(s32) = G_ANYEXT [[UV34]](s16) - ; GFX9-NEXT: [[ANYEXT35:%[0-9]+]]:_(s32) = G_ANYEXT [[UV35]](s16) - ; GFX9-NEXT: [[ANYEXT36:%[0-9]+]]:_(s32) = G_ANYEXT [[UV36]](s16) - ; GFX9-NEXT: [[ANYEXT37:%[0-9]+]]:_(s32) = G_ANYEXT [[UV37]](s16) - ; GFX9-NEXT: [[ANYEXT38:%[0-9]+]]:_(s32) = G_ANYEXT [[UV38]](s16) - ; GFX9-NEXT: [[ANYEXT39:%[0-9]+]]:_(s32) = G_ANYEXT [[UV39]](s16) - ; GFX9-NEXT: [[ANYEXT40:%[0-9]+]]:_(s32) = G_ANYEXT [[UV40]](s16) - ; GFX9-NEXT: [[ANYEXT41:%[0-9]+]]:_(s32) = G_ANYEXT [[UV41]](s16) - ; GFX9-NEXT: [[ANYEXT42:%[0-9]+]]:_(s32) = G_ANYEXT [[UV42]](s16) - ; GFX9-NEXT: [[ANYEXT43:%[0-9]+]]:_(s32) = G_ANYEXT [[UV43]](s16) - ; GFX9-NEXT: [[ANYEXT44:%[0-9]+]]:_(s32) = G_ANYEXT [[UV44]](s16) - ; GFX9-NEXT: [[ANYEXT45:%[0-9]+]]:_(s32) = G_ANYEXT [[UV45]](s16) - ; GFX9-NEXT: [[ANYEXT46:%[0-9]+]]:_(s32) = G_ANYEXT [[UV46]](s16) - ; GFX9-NEXT: [[ANYEXT47:%[0-9]+]]:_(s32) = G_ANYEXT [[UV47]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT32]](s32) - ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT33]](s32) - ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT34]](s32) - ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT35]](s32) - ; GFX9-NEXT: $vgpr4 = COPY [[ANYEXT36]](s32) - ; GFX9-NEXT: $vgpr5 = COPY [[ANYEXT37]](s32) - ; GFX9-NEXT: $vgpr6 = COPY [[ANYEXT38]](s32) - ; GFX9-NEXT: $vgpr7 = COPY [[ANYEXT39]](s32) - ; GFX9-NEXT: $vgpr8 = COPY [[ANYEXT40]](s32) - ; GFX9-NEXT: $vgpr9 = COPY [[ANYEXT41]](s32) - ; GFX9-NEXT: $vgpr10 = COPY [[ANYEXT42]](s32) - ; GFX9-NEXT: $vgpr11 = COPY [[ANYEXT43]](s32) - ; GFX9-NEXT: $vgpr12 = COPY [[ANYEXT44]](s32) - ; GFX9-NEXT: $vgpr13 = COPY [[ANYEXT45]](s32) - ; GFX9-NEXT: $vgpr14 = COPY [[ANYEXT46]](s32) - ; GFX9-NEXT: $vgpr15 = COPY [[ANYEXT47]](s32) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7 + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr9 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr10 + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr11 + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr12 + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr13 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr14 + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr15 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[COPY12]](<2 x s16>), [[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>), [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>), [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>), [[UV14:%[0-9]+]]:_(<2 x s16>), [[UV15:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](<2 x s16>) + ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](<2 x s16>) + ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](<2 x s16>) + ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](<2 x s16>) + ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](<2 x s16>) + ; GFX9-NEXT: $vgpr8 = COPY [[UV8]](<2 x s16>) + ; GFX9-NEXT: $vgpr9 = COPY [[UV9]](<2 x s16>) + ; GFX9-NEXT: $vgpr10 = COPY [[UV10]](<2 x s16>) + ; GFX9-NEXT: $vgpr11 = COPY [[UV11]](<2 x s16>) + ; GFX9-NEXT: $vgpr12 = COPY [[UV12]](<2 x s16>) + ; GFX9-NEXT: $vgpr13 = COPY [[UV13]](<2 x s16>) + ; GFX9-NEXT: $vgpr14 = COPY [[UV14]](<2 x s16>) + ; GFX9-NEXT: $vgpr15 = COPY [[UV15]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 ret <32 x bfloat> %arg0 } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll index 644ef05863abd..0a3d4ea42b705 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -3021,10 +3021,9 @@ define void @void_func_v2bf16_inreg(<2 x bfloat> inreg %arg0) #0 { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: liveins: $sgpr16 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16 - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY]](s32) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $sgpr16 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s16>), [[DEF]](p1) :: (store (<2 x s16>) into `ptr addrspace(1) poison`, addrspace 1) + ; CHECK-NEXT: G_STORE [[COPY]](<2 x s16>), [[DEF]](p1) :: (store (<2 x s16>) into `ptr addrspace(1) poison`, addrspace 1) ; CHECK-NEXT: SI_RETURN store <2 x bfloat> %arg0, ptr addrspace(1) poison ret void diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 38caab94a2819..f752cea3526af 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -54830,24 +54830,24 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v33 ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 ; GFX8-NEXT: v_fma_f32 v2, v2, v18, v33 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 ; GFX8-NEXT: v_fma_f32 v19, v35, v34, v19 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33 -; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX8-NEXT: v_fma_f32 v1, v1, v17, v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_fma_f32 v1, v1, v17, v18 ; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 -; GFX8-NEXT: v_fma_f32 v18, v35, v34, v18 +; GFX8-NEXT: v_fma_f32 v33, v35, v34, v33 ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v16 ; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX8-NEXT: v_fma_f32 v0, v0, v16, v17 ; GFX8-NEXT: v_bfe_u32 v16, v31, 16, 1 @@ -54861,181 +54861,181 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX8-NEXT: v_or_b32_e32 v15, 0x400000, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; GFX8-NEXT: v_bfe_u32 v17, v32, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v32 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v15, vcc +; GFX8-NEXT: v_bfe_u32 v15, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v32 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15 +; GFX8-NEXT: v_fma_f32 v34, v35, v34, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v31, vcc -; GFX8-NEXT: v_bfe_u32 v31, v14, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v14 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v32 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v18, vcc +; GFX8-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v14 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc -; GFX8-NEXT: v_bfe_u32 v31, v30, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v30 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc +; GFX8-NEXT: v_bfe_u32 v14, v30, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v30 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX8-NEXT: v_or_b32_e32 v30, 0x400000, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc -; GFX8-NEXT: v_bfe_u32 v31, v13, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v13 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc +; GFX8-NEXT: v_bfe_u32 v30, v13, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v13 +; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc -; GFX8-NEXT: v_bfe_u32 v31, v29, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v29 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v30, v13, vcc +; GFX8-NEXT: v_bfe_u32 v13, v29, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v29 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX8-NEXT: v_or_b32_e32 v29, 0x400000, v29 -; GFX8-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc -; GFX8-NEXT: v_bfe_u32 v31, v12, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v12 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc +; GFX8-NEXT: v_bfe_u32 v29, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v12 +; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc -; GFX8-NEXT: v_bfe_u32 v31, v28, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v28 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v29, v12, vcc +; GFX8-NEXT: v_bfe_u32 v12, v28, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v28 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX8-NEXT: v_or_b32_e32 v28, 0x400000, v28 -; GFX8-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc -; GFX8-NEXT: v_bfe_u32 v31, v11, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v11 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc +; GFX8-NEXT: v_bfe_u32 v28, v11, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v11 +; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc -; GFX8-NEXT: v_bfe_u32 v31, v27, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v27 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v28, v11, vcc +; GFX8-NEXT: v_bfe_u32 v11, v27, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v27 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX8-NEXT: v_or_b32_e32 v27, 0x400000, v27 -; GFX8-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc -; GFX8-NEXT: v_bfe_u32 v31, v10, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v10 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc +; GFX8-NEXT: v_bfe_u32 v27, v10, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v10 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc -; GFX8-NEXT: v_bfe_u32 v31, v26, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v26 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v27, v10, vcc +; GFX8-NEXT: v_bfe_u32 v10, v26, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v26 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc -; GFX8-NEXT: v_bfe_u32 v31, v9, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v9 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc +; GFX8-NEXT: v_bfe_u32 v26, v9, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v9 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc -; GFX8-NEXT: v_bfe_u32 v31, v25, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v25 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v26, v9, vcc +; GFX8-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc -; GFX8-NEXT: v_bfe_u32 v31, v8, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v8 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc +; GFX8-NEXT: v_bfe_u32 v26, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v8 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc -; GFX8-NEXT: v_bfe_u32 v31, v24, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v24 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc +; GFX8-NEXT: v_bfe_u32 v26, v24, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v24 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc -; GFX8-NEXT: v_bfe_u32 v31, v7, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v7 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v26, v24, vcc +; GFX8-NEXT: v_bfe_u32 v26, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v7 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc -; GFX8-NEXT: v_bfe_u32 v31, v23, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v23 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc +; GFX8-NEXT: v_bfe_u32 v26, v23, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v23 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX8-NEXT: v_or_b32_e32 v23, 0x400000, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc -; GFX8-NEXT: v_bfe_u32 v31, v6, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v6 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v26, v23, vcc +; GFX8-NEXT: v_bfe_u32 v26, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v6 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v31, v6, vcc -; GFX8-NEXT: v_bfe_u32 v31, v22, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v22 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v26, v6, vcc +; GFX8-NEXT: v_bfe_u32 v26, v22, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v22 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX8-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc -; GFX8-NEXT: v_bfe_u32 v31, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v5 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v26, v22, vcc +; GFX8-NEXT: v_bfe_u32 v26, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v5 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v31, v5, vcc -; GFX8-NEXT: v_bfe_u32 v31, v21, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v21 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v26, v5, vcc +; GFX8-NEXT: v_bfe_u32 v26, v21, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v21 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX8-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc -; GFX8-NEXT: v_bfe_u32 v31, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v4 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v26, v21, vcc +; GFX8-NEXT: v_bfe_u32 v26, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v4 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v31, v4, vcc -; GFX8-NEXT: v_bfe_u32 v31, v20, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v20 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc +; GFX8-NEXT: v_bfe_u32 v26, v20, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v20 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX8-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc -; GFX8-NEXT: v_bfe_u32 v31, v3, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v3 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v26, v20, vcc +; GFX8-NEXT: v_bfe_u32 v26, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v3 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v31, v3, vcc -; GFX8-NEXT: v_bfe_u32 v31, v19, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v19 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v26, v3, vcc +; GFX8-NEXT: v_bfe_u32 v26, v19, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v19 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX8-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v31, v19, vcc -; GFX8-NEXT: v_bfe_u32 v31, v2, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v2 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v26, v19, vcc +; GFX8-NEXT: v_bfe_u32 v26, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v2 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX8-NEXT: v_bfe_u32 v31, v18, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v18 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v26, v2, vcc +; GFX8-NEXT: v_bfe_u32 v26, v33, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v33 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v33 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v26, v31, vcc ; GFX8-NEXT: v_bfe_u32 v31, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v1 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 -; GFX8-NEXT: v_fma_f32 v33, v35, v34, v33 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v31, v1, vcc -; GFX8-NEXT: v_bfe_u32 v31, v33, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v33 +; GFX8-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v34 ; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v34 ; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc ; GFX8-NEXT: v_bfe_u32 v32, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v0 @@ -55043,38 +55043,38 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 ; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 ; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v30 +; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v29 +; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v28 +; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v27 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v26, 16 ; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 ; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 ; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v30, 16 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v22, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v21, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v20, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v19, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v18, v15, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v17, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_fma_v32bf16: @@ -55223,189 +55223,189 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v33 ; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 ; GFX900-NEXT: v_fma_f32 v2, v2, v18, v33 -; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 ; GFX900-NEXT: v_fma_f32 v19, v35, v34, v19 ; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v1 ; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v33 -; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX900-NEXT: v_fma_f32 v1, v1, v17, v33 -; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX900-NEXT: v_fma_f32 v18, v35, v34, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_fma_f32 v1, v1, v17, v18 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_fma_f32 v33, v35, v34, v33 ; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v16 ; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v0 ; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX900-NEXT: v_fma_f32 v0, v0, v16, v33 +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_fma_f32 v0, v0, v16, v17 ; GFX900-NEXT: v_bfe_u32 v16, v31, 16, 1 ; GFX900-NEXT: v_add3_u32 v16, v16, v31, s4 -; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v31 -; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v31, vcc -; GFX900-NEXT: v_bfe_u32 v31, v15, 16, 1 -; GFX900-NEXT: v_add3_u32 v31, v31, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v31 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX900-NEXT: v_or_b32_e32 v15, 0x400000, v15 -; GFX900-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc -; GFX900-NEXT: v_bfe_u32 v31, v32, 16, 1 -; GFX900-NEXT: v_add3_u32 v31, v31, v32, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; GFX900-NEXT: v_bfe_u32 v17, v32, 16, 1 +; GFX900-NEXT: v_fma_f32 v34, v35, v34, v18 +; GFX900-NEXT: v_add3_u32 v17, v17, v32, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v32 -; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc -; GFX900-NEXT: v_bfe_u32 v32, v14, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; GFX900-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX900-NEXT: v_add3_u32 v18, v18, v14, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v14 -; GFX900-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc -; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v18, v14, vcc +; GFX900-NEXT: v_bfe_u32 v18, v30, 16, 1 +; GFX900-NEXT: v_add3_u32 v18, v18, v30, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX900-NEXT: v_or_b32_e32 v30, 0x400000, v30 -; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v30, vcc -; GFX900-NEXT: v_bfe_u32 v32, v13, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v13, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v18, v30, vcc +; GFX900-NEXT: v_bfe_u32 v30, v13, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v13, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v13 -; GFX900-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc -; GFX900-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v29, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v30, v13, vcc +; GFX900-NEXT: v_bfe_u32 v30, v29, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v29, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; GFX900-NEXT: v_or_b32_e32 v29, 0x400000, v29 -; GFX900-NEXT: v_cndmask_b32_e32 v29, v32, v29, vcc -; GFX900-NEXT: v_bfe_u32 v32, v12, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v12, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc +; GFX900-NEXT: v_bfe_u32 v30, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v12, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v12 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc -; GFX900-NEXT: v_bfe_u32 v32, v28, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v28, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc +; GFX900-NEXT: v_bfe_u32 v30, v28, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v28, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; GFX900-NEXT: v_or_b32_e32 v28, 0x400000, v28 -; GFX900-NEXT: v_cndmask_b32_e32 v28, v32, v28, vcc -; GFX900-NEXT: v_bfe_u32 v32, v11, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v11, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v30, v28, vcc +; GFX900-NEXT: v_bfe_u32 v30, v11, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v11, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v11 -; GFX900-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc -; GFX900-NEXT: v_bfe_u32 v32, v27, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v27, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v30, v11, vcc +; GFX900-NEXT: v_bfe_u32 v30, v27, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v27, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; GFX900-NEXT: v_or_b32_e32 v27, 0x400000, v27 -; GFX900-NEXT: v_cndmask_b32_e32 v27, v32, v27, vcc -; GFX900-NEXT: v_bfe_u32 v32, v10, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v10, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v30, v27, vcc +; GFX900-NEXT: v_bfe_u32 v30, v10, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v10, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v10 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc -; GFX900-NEXT: v_bfe_u32 v32, v26, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v26, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v30, v10, vcc +; GFX900-NEXT: v_bfe_u32 v30, v26, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v26, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v26 -; GFX900-NEXT: v_cndmask_b32_e32 v26, v32, v26, vcc -; GFX900-NEXT: v_bfe_u32 v32, v9, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v9, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v30, v26, vcc +; GFX900-NEXT: v_bfe_u32 v30, v9, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v9, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v9 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc -; GFX900-NEXT: v_bfe_u32 v32, v25, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v25, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v30, v9, vcc +; GFX900-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v25, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v25 -; GFX900-NEXT: v_cndmask_b32_e32 v25, v32, v25, vcc -; GFX900-NEXT: v_bfe_u32 v32, v8, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v8, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v30, v25, vcc +; GFX900-NEXT: v_bfe_u32 v30, v8, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v8, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v8 -; GFX900-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc -; GFX900-NEXT: v_bfe_u32 v32, v24, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v24, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v30, v8, vcc +; GFX900-NEXT: v_bfe_u32 v30, v24, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v24, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v24 -; GFX900-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc -; GFX900-NEXT: v_bfe_u32 v32, v7, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v7, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v30, v24, vcc +; GFX900-NEXT: v_bfe_u32 v30, v7, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v7, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v7 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc -; GFX900-NEXT: v_bfe_u32 v32, v23, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v23, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v30, v7, vcc +; GFX900-NEXT: v_bfe_u32 v30, v23, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v23, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; GFX900-NEXT: v_or_b32_e32 v23, 0x400000, v23 -; GFX900-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc -; GFX900-NEXT: v_bfe_u32 v32, v6, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v6, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v30, v23, vcc +; GFX900-NEXT: v_bfe_u32 v30, v6, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v6, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v6 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc -; GFX900-NEXT: v_bfe_u32 v32, v22, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v22, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v30, v6, vcc +; GFX900-NEXT: v_bfe_u32 v30, v22, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v22, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; GFX900-NEXT: v_or_b32_e32 v22, 0x400000, v22 -; GFX900-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc -; GFX900-NEXT: v_bfe_u32 v32, v5, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v5, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc +; GFX900-NEXT: v_bfe_u32 v30, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v5, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc -; GFX900-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v21, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v30, v5, vcc +; GFX900-NEXT: v_bfe_u32 v30, v21, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v21, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; GFX900-NEXT: v_or_b32_e32 v21, 0x400000, v21 -; GFX900-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc -; GFX900-NEXT: v_bfe_u32 v32, v4, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v4, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v30, v21, vcc +; GFX900-NEXT: v_bfe_u32 v30, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v4, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc -; GFX900-NEXT: v_bfe_u32 v32, v20, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v20, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v30, v4, vcc +; GFX900-NEXT: v_bfe_u32 v30, v20, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v20, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; GFX900-NEXT: v_or_b32_e32 v20, 0x400000, v20 -; GFX900-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc -; GFX900-NEXT: v_bfe_u32 v32, v3, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v3, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v30, v20, vcc +; GFX900-NEXT: v_bfe_u32 v30, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v3, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc -; GFX900-NEXT: v_bfe_u32 v32, v19, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v19, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc +; GFX900-NEXT: v_bfe_u32 v30, v19, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v19, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; GFX900-NEXT: v_or_b32_e32 v19, 0x400000, v19 -; GFX900-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc -; GFX900-NEXT: v_bfe_u32 v32, v2, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v2, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v30, v19, vcc +; GFX900-NEXT: v_bfe_u32 v30, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v2, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc -; GFX900-NEXT: v_bfe_u32 v32, v18, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v18, s4 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v18 -; GFX900-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc -; GFX900-NEXT: v_bfe_u32 v32, v1, 16, 1 -; GFX900-NEXT: v_fma_f32 v17, v35, v34, v17 -; GFX900-NEXT: v_add3_u32 v32, v32, v1, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc +; GFX900-NEXT: v_bfe_u32 v30, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v30, v30, v33, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v33 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v30, v31, vcc +; GFX900-NEXT: v_bfe_u32 v31, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v31, v31, v1, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX900-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc -; GFX900-NEXT: v_bfe_u32 v32, v17, 16, 1 -; GFX900-NEXT: v_add3_u32 v32, v32, v17, s4 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v17 -; GFX900-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v31, v1, vcc +; GFX900-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX900-NEXT: v_add3_u32 v31, v31, v34, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v34 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc ; GFX900-NEXT: v_bfe_u32 v32, v0, 16, 1 ; GFX900-NEXT: v_add3_u32 v32, v32, v0, s4 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX900-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x7060302 -; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v0, v0, v31, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v30, s4 ; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 ; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 ; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 @@ -55417,184 +55417,188 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 ; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 ; GFX900-NEXT: v_perm_b32 v12, v12, v29, s4 -; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v18, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v17, s4 ; GFX900-NEXT: v_perm_b32 v15, v15, v16, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fma_v32bf16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:64 -; GFX950-NEXT: scratch_load_dword v36, off, s32 -; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:60 -; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:56 -; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:52 -; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48 -; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:44 -; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:40 -; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:36 -; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:32 -; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:28 -; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:4 -; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:8 -; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:12 -; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:16 -; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:20 -; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:24 -; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v32, off, s32 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:40 +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:24 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:20 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:16 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:12 +; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:8 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v45, 0xffff0000, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; GFX950-NEXT: v_and_b32_e32 v58, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v59, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse -; GFX950-NEXT: v_and_b32_e32 v43, 0xffff0000, v14 -; GFX950-NEXT: v_lshlrev_b32_e32 v45, 16, v14 -; GFX950-NEXT: v_and_b32_e32 v46, 0xffff0000, v29 -; GFX950-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GFX950-NEXT: v_and_b32_e32 v59, 0xffff0000, v12 -; GFX950-NEXT: v_lshlrev_b32_e32 v61, 16, v12 -; GFX950-NEXT: v_and_b32_e32 v62, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v60, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v61, 0xffff0000, v11 ; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse -; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v30 -; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v30 -; GFX950-NEXT: v_and_b32_e32 v47, 0xffff0000, v13 -; GFX950-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v57, 0xffff0000, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse -; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v15 -; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse -; GFX950-NEXT: v_and_b32_e32 v58, 0xffff0000, v28 -; GFX950-NEXT: v_lshlrev_b32_e32 v60, 16, v28 -; GFX950-NEXT: s_waitcnt vmcnt(16) -; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v35 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v41, 0xffff0000, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v43, 16, v30 +; GFX950-NEXT: v_and_b32_e32 v46, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v56, 16, v13 ; GFX950-NEXT: s_waitcnt vmcnt(15) -; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v36 -; GFX950-NEXT: v_lshlrev_b32_e32 v63, 16, v36 -; GFX950-NEXT: s_waitcnt vmcnt(14) -; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 -; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; GFX950-NEXT: s_waitcnt vmcnt(13) +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v33 ; GFX950-NEXT: s_waitcnt vmcnt(11) -; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v49 -; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v11 -; GFX950-NEXT: v_fmac_f32_e32 v36, v38, v62 -; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v49 -; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 -; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v39 -; GFX950-NEXT: v_fmac_f32_e32 v38, v11, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v38 +; GFX950-NEXT: v_fmac_f32_e32 v33, v12, v59 +; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; GFX950-NEXT: v_fmac_f32_e32 v12, v11, v27 +; GFX950-NEXT: scratch_load_dword v27, off, s32 offset:4 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; GFX950-NEXT: v_and_b32_e32 v62, 0xffff0000, v32 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v38 +; GFX950-NEXT: v_fmac_f32_e32 v28, v40, v31 ; GFX950-NEXT: s_waitcnt vmcnt(10) -; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 -; GFX950-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v10 -; GFX950-NEXT: v_fmac_f32_e32 v11, v39, v27 -; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v35 +; GFX950-NEXT: v_and_b32_e32 v31, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v10 +; GFX950-NEXT: v_fmac_f32_e32 v11, v38, v31 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v35 ; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX950-NEXT: v_fmac_f32_e32 v27, v10, v26 +; GFX950-NEXT: v_fmac_f32_e32 v31, v10, v26 ; GFX950-NEXT: s_waitcnt vmcnt(9) -; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 ; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 -; GFX950-NEXT: v_fmac_f32_e32 v10, v39, v26 -; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v51 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v9 +; GFX950-NEXT: v_fmac_f32_e32 v10, v35, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v36 ; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX950-NEXT: v_fmac_f32_e32 v26, v9, v25 ; GFX950-NEXT: s_waitcnt vmcnt(8) -; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 ; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 -; GFX950-NEXT: v_fmac_f32_e32 v9, v39, v25 -; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v52 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v8 +; GFX950-NEXT: v_fmac_f32_e32 v9, v35, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v37 ; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX950-NEXT: v_fmac_f32_e32 v25, v8, v24 ; GFX950-NEXT: s_waitcnt vmcnt(7) -; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v53 +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v48 ; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 -; GFX950-NEXT: v_fmac_f32_e32 v8, v39, v24 -; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v7 +; GFX950-NEXT: v_fmac_f32_e32 v8, v35, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v48 ; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX950-NEXT: v_fmac_f32_e32 v24, v7, v23 ; GFX950-NEXT: s_waitcnt vmcnt(6) -; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v54 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 ; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v6 -; GFX950-NEXT: v_fmac_f32_e32 v7, v39, v23 -; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v54 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 +; GFX950-NEXT: v_fmac_f32_e32 v7, v35, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v49 ; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX950-NEXT: v_fmac_f32_e32 v23, v6, v22 -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v55 +; GFX950-NEXT: s_waitcnt vmcnt(5) +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 ; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v5 -; GFX950-NEXT: v_fmac_f32_e32 v6, v39, v22 -; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 +; GFX950-NEXT: v_fmac_f32_e32 v6, v35, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v50 ; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX950-NEXT: v_fmac_f32_e32 v22, v5, v21 -; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 +; GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v51 ; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 -; GFX950-NEXT: v_fmac_f32_e32 v5, v39, v21 -; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v4 +; GFX950-NEXT: v_fmac_f32_e32 v5, v35, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v51 ; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX950-NEXT: v_fmac_f32_e32 v21, v4, v20 -; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v34 +; GFX950-NEXT: s_waitcnt vmcnt(3) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v52 ; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 -; GFX950-NEXT: v_fmac_f32_e32 v4, v37, v20 -; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v34 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v3 +; GFX950-NEXT: v_fmac_f32_e32 v4, v35, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v52 ; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX950-NEXT: v_fmac_f32_e32 v20, v3, v19 -; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 ; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 -; GFX950-NEXT: v_fmac_f32_e32 v3, v34, v19 -; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v2 +; GFX950-NEXT: v_fmac_f32_e32 v3, v35, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v53 ; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX950-NEXT: v_fmac_f32_e32 v19, v2, v18 -; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 -; GFX950-NEXT: v_fmac_f32_e32 v2, v33, v18 -; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v1 +; GFX950-NEXT: v_fmac_f32_e32 v2, v35, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v54 ; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX950-NEXT: v_fmac_f32_e32 v18, v1, v17 -; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 -; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v35 -; GFX950-NEXT: v_fmac_f32_e32 v15, v40, v12 -; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v48 -; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v48 -; GFX950-NEXT: v_fmac_f32_e32 v1, v32, v17 -; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v39 +; GFX950-NEXT: v_fmac_f32_e32 v1, v35, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v27 ; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX950-NEXT: v_fmac_f32_e32 v28, v41, v63 -; GFX950-NEXT: v_fmac_f32_e32 v14, v43, v42 -; GFX950-NEXT: v_fmac_f32_e32 v29, v45, v44 -; GFX950-NEXT: v_fmac_f32_e32 v13, v47, v46 -; GFX950-NEXT: v_fmac_f32_e32 v30, v57, v56 -; GFX950-NEXT: v_fmac_f32_e32 v12, v59, v58 -; GFX950-NEXT: v_fmac_f32_e32 v35, v61, v60 +; GFX950-NEXT: v_fmac_f32_e32 v15, v55, v62 +; GFX950-NEXT: v_fmac_f32_e32 v14, v42, v41 +; GFX950-NEXT: v_fmac_f32_e32 v29, v44, v43 +; GFX950-NEXT: v_fmac_f32_e32 v13, v46, v45 +; GFX950-NEXT: v_fmac_f32_e32 v30, v56, v47 +; GFX950-NEXT: v_fmac_f32_e32 v32, v58, v57 +; GFX950-NEXT: v_fmac_f32_e32 v34, v61, v60 ; GFX950-NEXT: v_fmac_f32_e32 v17, v0, v16 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v17, v1 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v18, v2 @@ -55606,13 +55610,12 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v24, v8 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v25, v9 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v26, v10 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v27, v11 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v38, v36 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v35, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v31, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v12, v34 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v33, v32 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v30, v13 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v29, v14 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v28, v15 -; GFX950-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse @@ -55700,33 +55703,33 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v38 ; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v38 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v25 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v39 ; GFX10-NEXT: v_fmac_f32_e32 v27, v50, v49 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GFX10-NEXT: v_fmac_f32_e32 v10, v52, v51 +; GFX10-NEXT: v_fmac_f32_e32 v26, v52, v51 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v31 -; GFX10-NEXT: v_fmac_f32_e32 v26, v49, v38 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_fmac_f32_e32 v10, v49, v38 ; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v9 -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v24 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v31 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v31, v31 -; GFX10-NEXT: v_fmac_f32_e32 v9, v49, v25 -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; GFX10-NEXT: v_fmac_f32_e32 v39, v9, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v8 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v48 ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 -; GFX10-NEXT: v_fmac_f32_e32 v25, v49, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; GFX10-NEXT: v_fmac_f32_e32 v25, v9, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v23 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v7 ; GFX10-NEXT: v_fmac_f32_e32 v48, v8, v24 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 @@ -55737,8 +55740,8 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v33 ; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GFX10-NEXT: v_fmac_f32_e32 v8, v49, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v6 +; GFX10-NEXT: v_fmac_f32_e32 v8, v49, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX10-NEXT: v_fmac_f32_e32 v33, v7, v23 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v21 @@ -55748,10 +55751,10 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v5 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_fmac_f32_e32 v7, v39, v24 +; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v24 ; GFX10-NEXT: v_fmac_f32_e32 v34, v6, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v4 ; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v35 ; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 @@ -55768,17 +55771,17 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 ; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v2 -; GFX10-NEXT: v_fmac_f32_e32 v5, v39, v24 -; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_fmac_f32_e32 v5, v24, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v17 ; GFX10-NEXT: v_fmac_f32_e32 v36, v4, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v16 ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX10-NEXT: v_fmac_f32_e32 v39, v23, v22 +; GFX10-NEXT: v_fmac_f32_e32 v24, v23, v22 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v37 ; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -55795,36 +55798,36 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v38 ; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v9 +; GFX10-NEXT: v_bfe_u32 v9, v33, 16, 1 ; GFX10-NEXT: v_fmac_f32_e32 v37, v21, v49 ; GFX10-NEXT: v_fmac_f32_e32 v50, v2, v18 -; GFX10-NEXT: v_fmac_f32_e32 v19, v1, v17 -; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v48 ; GFX10-NEXT: v_fmac_f32_e32 v38, v0, v16 ; GFX10-NEXT: v_bfe_u32 v0, v48, 16, 1 -; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v19, v1, v17 +; GFX10-NEXT: v_fmac_f32_e32 v51, v22, v20 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v48 ; GFX10-NEXT: v_bfe_u32 v2, v8, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v33 -; GFX10-NEXT: v_bfe_u32 v18, v7, 16, 1 -; GFX10-NEXT: v_bfe_u32 v21, v34, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v17, v7, 16, 1 +; GFX10-NEXT: v_bfe_u32 v20, v34, 16, 1 ; GFX10-NEXT: v_add3_u32 v0, v0, v48, 0x7fff ; GFX10-NEXT: v_bfe_u32 v48, v35, 16, 1 -; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_add3_u32 v9, v9, v33, 0x7fff ; GFX10-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v24 -; GFX10-NEXT: v_fmac_f32_e32 v51, v22, v20 ; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v8 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v7 -; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v34 -; GFX10-NEXT: v_bfe_u32 v24, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v7 +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v22, v6, 16, 1 ; GFX10-NEXT: v_add3_u32 v2, v2, v8, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v8, v8 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v35 -; GFX10-NEXT: v_add3_u32 v18, v18, v7, 0x7fff +; GFX10-NEXT: v_add3_u32 v17, v17, v7, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s6, v7, v7 ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v21, v21, v34, 0x7fff +; GFX10-NEXT: v_add3_u32 v20, v20, v34, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s7, v34, v34 -; GFX10-NEXT: v_bfe_u32 v34, v39, 16, 1 +; GFX10-NEXT: v_bfe_u32 v34, v24, 16, 1 ; GFX10-NEXT: v_add3_u32 v48, v48, v35, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s9, v35, v35 ; GFX10-NEXT: v_bfe_u32 v35, v23, 16, 1 @@ -55832,12 +55835,12 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: v_cmp_u_f32_e64 s10, v5, v5 ; GFX10-NEXT: v_bfe_u32 v5, v37, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v6 -; GFX10-NEXT: v_add3_u32 v24, v24, v6, 0x7fff +; GFX10-NEXT: v_add3_u32 v22, v22, v6, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s8, v6, v6 -; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v39 -; GFX10-NEXT: v_add3_u32 v34, v34, v39, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s11, v39, v39 -; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v23 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v24 +; GFX10-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v24, v24 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v23 ; GFX10-NEXT: v_add3_u32 v35, v35, v23, 0x7fff ; GFX10-NEXT: v_cmp_u_f32_e64 s12, v23, v23 ; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v37 @@ -55846,32 +55849,32 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: v_bfe_u32 v37, v31, 16, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v53, v2, v4, s4 ; GFX10-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v17, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v18, v20, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v16, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v17, v18, s6 ; GFX10-NEXT: v_add3_u32 v37, v37, v31, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v18, v21, v22, s7 -; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 -; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v20, v21, s7 +; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v21, v19, 16, 1 ; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e64 v31, v37, v52, s14 ; GFX10-NEXT: v_bfe_u32 v37, v15, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v15 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v15, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v24, v49, s8 -; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v19 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v22, v49, s8 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX10-NEXT: v_add3_u32 v37, v37, v15, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e64 v7, v33, v7, s10 ; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 -; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_add3_u32 v21, v21, v19, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, v6, s11 ; GFX10-NEXT: v_cndmask_b32_e64 v15, v37, v52, s14 ; GFX10-NEXT: v_bfe_u32 v37, v32, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v32 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v32, v32 ; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v51 -; GFX10-NEXT: v_cndmask_b32_e64 v35, v35, v39, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v35, v24, s12 ; GFX10-NEXT: v_add3_u32 v37, v37, v32, 0x7fff -; GFX10-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX10-NEXT: v_bfe_u32 v35, v38, 16, 1 ; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v23, s13 ; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v38 @@ -55879,7 +55882,7 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: v_bfe_u32 v37, v14, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v14 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v14, v14 -; GFX10-NEXT: v_add3_u32 v39, v39, v38, 0x7fff +; GFX10-NEXT: v_add3_u32 v35, v35, v38, 0x7fff ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v50 ; GFX10-NEXT: v_add3_u32 v37, v37, v14, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, v8, s9 @@ -55924,55 +55927,55 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf ; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 ; GFX10-NEXT: v_add3_u32 v37, v37, v27, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e64 v27, v37, v52, s14 -; GFX10-NEXT: v_bfe_u32 v37, v10, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v10 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v10, v10 -; GFX10-NEXT: v_add3_u32 v37, v37, v10, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v10, v37, v52, s14 ; GFX10-NEXT: v_bfe_u32 v37, v26, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v26 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v26, v26 -; GFX10-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 ; GFX10-NEXT: v_add3_u32 v37, v37, v26, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e64 v26, v37, v52, s14 -; GFX10-NEXT: v_bfe_u32 v37, v9, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v9 -; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 -; GFX10-NEXT: v_add3_u32 v37, v37, v9, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v9, v37, v52, s14 -; GFX10-NEXT: v_bfe_u32 v37, v25, 16, 1 +; GFX10-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v10 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v10, v10 +; GFX10-NEXT: v_add3_u32 v37, v37, v10, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v10, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v39, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v39 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v39, v39 +; GFX10-NEXT: v_add3_u32 v37, v37, v39, 0x7fff +; GFX10-NEXT: v_bfe_u32 v39, v25, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v52, s14 +; GFX10-NEXT: v_add3_u32 v39, v39, v25, 0x7fff ; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v25 ; GFX10-NEXT: v_cmp_u_f32_e64 s14, v25, v25 -; GFX10-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 -; GFX10-NEXT: v_add3_u32 v37, v37, v25, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e64 v25, v37, v52, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v39, v52, s14 ; GFX10-NEXT: v_cndmask_b32_e32 v52, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_bfe_u32 v1, v50, 16, 1 -; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX10-NEXT: v_bfe_u32 v39, v36, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v36 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v20, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v18, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX10-NEXT: v_add3_u32 v1, v1, v50, 0x7fff -; GFX10-NEXT: v_add3_u32 v37, v37, v36, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v24, vcc_lo +; GFX10-NEXT: v_add3_u32 v39, v39, v36, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX10-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v19, v35, v23, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX10-NEXT: v_perm_b32 v1, v4, v3, 0x7060302 -; GFX10-NEXT: v_perm_b32 v3, v35, v6, 0x7060302 -; GFX10-NEXT: v_perm_b32 v6, v18, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v24, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v17, v16, 0x7060302 ; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 -; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v0, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v20, v19, 0x7060302 -; GFX10-NEXT: v_perm_b32 v5, v8, v21, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v21, v39, v0, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v19, v18, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v8, v20, 0x7060302 ; GFX10-NEXT: v_perm_b32 v8, v52, v25, 0x7060302 -; GFX10-NEXT: v_perm_b32 v4, v22, v7, 0x7060302 -; GFX10-NEXT: v_perm_b32 v7, v16, v53, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v21, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v9, v53, 0x7060302 +; GFX10-NEXT: v_perm_b32 v9, v37, v10, 0x7060302 +; GFX10-NEXT: v_perm_b32 v10, v26, v27, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_fma_v32bf16: diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 6a241dfa463bd..31ff0572bfd29 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -2403,7 +2403,6 @@ define amdgpu_vs <2 x bfloat> @load_v2bf16(ptr addrspace(6) inreg %p0, ptr addrs ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_v2bf16: @@ -2438,7 +2437,6 @@ define amdgpu_vs <2 x bfloat> @load_v2bf16(ptr addrspace(6) inreg %p0, ptr addrs ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %gep1 = getelementptr inbounds <2 x bfloat>, ptr addrspace(6) %p1, i32 2 %r0 = load <2 x bfloat>, ptr addrspace(6) %p0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll index f3bca31e3fd9e..06b404e9f2c05 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll @@ -515,36 +515,37 @@ define <2 x bfloat> @v_exp2_fabs_v2bf16(<2 x bfloat> %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 -; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v2 ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 ; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -561,68 +562,66 @@ define <2 x bfloat> @v_exp2_fabs_v2bf16(<2 x bfloat> %in) { ; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0x7fff, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 -; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 -; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 -; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff ; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fabs_v2bf16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX1250-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 -; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v1.l -; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.h, v2.l +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, |v0.h| +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, |v0.l| ; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fabs_v2bf16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX1250-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v1, v1 -; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, |v0| +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, |v1| ; GFX1250-SDAG-FAKE16-NEXT: v_nop -; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %fabs) @@ -637,47 +636,44 @@ define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 15 -; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, 0x8000, v1.l -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v1.h, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, 0x8000, v0.l -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v1.h, 0x8000, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_sdst(0) -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0 -; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v3 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0 -; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo ; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16: @@ -687,73 +683,66 @@ define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) { ; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 ; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 ; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 -; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 ; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo -; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 -; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_v2bf16: ; GFX1250-SDAG-TRUE16: ; %bb.0: ; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX1250-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 -; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -v1.l -; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, -v2.l +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, -|v0.h| +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -|v0.l| ; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16: ; GFX1250-SDAG-FAKE16: ; %bb.0: ; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX1250-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, -v1 -; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -v0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -|v0| +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, -|v1| ; GFX1250-SDAG-FAKE16-NEXT: v_nop -; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) %fneg.fabs = fneg <2 x bfloat> %fabs diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll index 5bd9fa6f23aa0..a61cf8359d3bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll @@ -127,25 +127,20 @@ define <2 x bfloat> @v_log2_fabs_v2bf16(<2 x bfloat> %in) { ; GFX-SDAG-TRUE16: ; %bb.0: ; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 -; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v1.l -; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v2.l +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, |v0.h| +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, |v0.l| ; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_v2bf16: ; GFX-SDAG-FAKE16: ; %bb.0: ; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1 -; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, |v0| +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, |v1| ; GFX-SDAG-FAKE16-NEXT: v_nop -; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fabs) @@ -157,25 +152,20 @@ define <2 x bfloat> @v_log2_fneg_fabs_v2bf16(<2 x bfloat> %in) { ; GFX-SDAG-TRUE16: ; %bb.0: ; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 -; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v1.l -; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v2.l +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -|v0.h| +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -|v0.l| ; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] ; ; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_v2bf16: ; GFX-SDAG-FAKE16: ; %bb.0: ; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1 -; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0 +; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -|v0| +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -|v1| ; GFX-SDAG-FAKE16-NEXT: v_nop -; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) -; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) %fneg.fabs = fneg <2 x bfloat> %fabs diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll index ccb807695e270..cd387b5a429e9 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll @@ -10098,21 +10098,20 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: scratch_load_b32 v55, off, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v37, v12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v50, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v9 :: v_dual_and_b32 v8, 0xffff0000, v53 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v31, v10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v33, v8 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_and_b32 v8, 0xffff0000, v53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v11 :: v_dual_mov_b32 v37, v12 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v36, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v19 @@ -10180,7 +10179,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v34.h, v27.h, s1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v8 @@ -10194,7 +10193,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v31.h, v26.h, s1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v25 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v8 @@ -10205,7 +10204,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v8.h, v54.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v39.h, v25.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v36.h, v25.h, s1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.h, v54.h, vcc_lo @@ -10217,7 +10216,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v8.h, v54.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v50.h, v24.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v33.h, v24.h, s1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v54.h, vcc_lo @@ -10229,39 +10228,41 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v54.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v7.h, v23.h, s1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v32.h, v54.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v32.h, v54.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v32.h, v54.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v32.h, v54.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo @@ -10421,7 +10422,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v54.h, v28.h, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v26.l, v27.h, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v36 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v27.h ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v54 @@ -10431,12 +10432,12 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v54.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v36.l, v25.l, vcc_lo ; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v54.h, v27.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v25.l, v26.h, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v24 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v26.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -10447,7 +10448,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v54.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v33.l, v24.l, vcc_lo ; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v54.h, v26.h, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) @@ -10480,7 +10481,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v54.l ; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v54.h, v7.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v54.h, v7.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v22.l, v6.h, s2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v21 @@ -10496,13 +10497,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v54.l ; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v54.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v54.h, v6.h, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.l, v5.h, s2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v50 ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v54 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 @@ -10518,7 +10519,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v39 ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v54 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 @@ -11168,21 +11169,20 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: scratch_load_b32 v55, off, s32 ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v50, v8 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_and_b32 v8, 0xffff0000, v53 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v39, v9 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v31, v10 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v33, v8 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_and_b32 v8, 0xffff0000, v53 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v34, v11 :: v_dual_mov_b32 v37, v12 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v36, v9 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v51 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v23 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v22 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v19 @@ -11260,7 +11260,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v34.h, v27.h, s1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -11276,7 +11276,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v31.h, v26.h, s1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v25 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v33 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -11289,7 +11289,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v8.h, v54.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v39.h, v25.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v36.h, v25.h, s1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11304,7 +11304,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v8.h, v54.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v50.h, v24.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v33.h, v24.h, s1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -11318,53 +11318,53 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 ; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v54.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v7.h, v23.h, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v32.h, v54.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v32.h, v54.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo +; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v32.h, v54.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v32.h, v54.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v32 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 @@ -11548,7 +11548,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v54.h, v28.h, s0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v26.l, v27.h, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v36 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v27.h ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v54 @@ -11560,13 +11560,13 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v54.l ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v36.l, v25.l, vcc_lo ; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v54.h, v27.h, s0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v25.l, v26.h, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v50 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v33 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v24 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v26.h ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v54 @@ -11578,7 +11578,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v54.l ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v33.l, v24.l, vcc_lo ; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v54.h, v26.h, s0 @@ -11616,7 +11616,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v54.l ; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v54.h, v7.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v54.h, v7.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v22.l, v6.h, s2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v21 @@ -11635,12 +11635,12 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v54.l ; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v54.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v54.h, v6.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.l, v5.h, s2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v33 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v50 ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v54 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo @@ -11658,7 +11658,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v39 ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v54 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll index 246fa7d41e1ef..dc47782c15281 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll @@ -10131,21 +10131,20 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: scratch_load_b32 v55, off, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v37, v12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v50, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v9 :: v_dual_and_b32 v8, 0xffff0000, v53 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v31, v10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v33, v8 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_and_b32 v8, 0xffff0000, v53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v11 :: v_dual_mov_b32 v37, v12 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v36, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v19 @@ -10213,7 +10212,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v34.h, v27.h, s1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v8 @@ -10227,7 +10226,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v31.h, v26.h, s1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v25 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v8 @@ -10238,7 +10237,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v8.h, v54.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v39.h, v25.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v36.h, v25.h, s1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.h, v54.h, vcc_lo @@ -10250,7 +10249,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v8.h, v54.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v50.h, v24.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v33.h, v24.h, s1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v54.h, vcc_lo @@ -10262,39 +10261,41 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v54.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v7.h, v23.h, s1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v32.h, v54.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v32.h, v54.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v32.h, v54.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v32.h, v54.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo @@ -10454,7 +10455,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v54.h, v28.h, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v26.l, v27.h, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v36 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v27.h ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v54 @@ -10464,12 +10465,12 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v54.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v36.l, v25.l, vcc_lo ; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v54.h, v27.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v25.l, v26.h, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v33 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v24 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v26.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -10480,7 +10481,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v54.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v33.l, v24.l, vcc_lo ; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v54.h, v26.h, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) @@ -10513,7 +10514,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v54.l ; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v54.h, v7.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v54.h, v7.h, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v22.l, v6.h, s2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v21 @@ -10529,13 +10530,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v54.l ; GFX11-TRUE16-NEXT: s_and_b32 s0, s1, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v54.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v54.h, v6.h, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.l, v5.h, s2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.h ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v50 ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v54 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 @@ -10551,7 +10552,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v39 ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v54 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 @@ -11201,21 +11202,20 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: scratch_load_b32 v55, off, s32 ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v53, v15 :: v_dual_mov_b32 v48, v13 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v50, v8 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_and_b32 v8, 0xffff0000, v53 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v39, v9 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v31, v10 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v33, v8 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_mov_b32 v34, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v51, v14 :: v_dual_and_b32 v8, 0xffff0000, v53 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v34, v11 :: v_dual_mov_b32 v37, v12 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v36, v9 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v51 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v30 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v23 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v22 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v19 @@ -11293,7 +11293,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v34.h, v27.h, s1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v26 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -11309,7 +11309,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v31.h, v26.h, s1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v25 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v33 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v26.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) @@ -11322,7 +11322,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v8.h, v54.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v39.h, v25.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v36.h, v25.h, s1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11337,7 +11337,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v8.h, v54.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v50.h, v24.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v33.h, v24.h, s1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -11351,53 +11351,53 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 ; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v54.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v7.h, v23.h, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v6 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v23.h, v54.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v32.h, v54.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v32.h, v54.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v6.h, v22.h, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo +; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v22.h, v54.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v32.h, v54.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v32.h, v54.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v5.h, v21.h, s1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v35, v35 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v54.l ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v21.h, v54.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v32 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v54.h, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v32 ; GFX12-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 @@ -11581,7 +11581,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v54.h, v28.h, s0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v26.l, v27.h, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v36 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v27.h ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v54 @@ -11593,13 +11593,13 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v54.l ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v39.l, v25.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v36.l, v25.l, vcc_lo ; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v54.h, v27.h, s0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v25.l, v26.h, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v50 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v33 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v24 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v26.h ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v54 @@ -11611,7 +11611,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v54.l ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v54 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v50.l, v24.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v33.l, v24.l, vcc_lo ; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v54.h, v26.h, s0 @@ -11649,7 +11649,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v54.l ; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v54.h, v7.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v54.h, v7.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v22.l, v6.h, s2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v21 @@ -11668,12 +11668,12 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v54.l ; GFX12-TRUE16-NEXT: s_and_b32 s0, s1, s0 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v54.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v54.h, v6.h, s0 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.l, v5.h, s2 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.h ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v33 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v50 ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v54 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v5.h, vcc_lo @@ -11691,7 +11691,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v39 ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v54 ; GFX12-TRUE16-NEXT: s_wait_alu depctr_va_vcc(0) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v54.h, v4.h, vcc_lo